In [None]:
# ===================================================================
# PHASE 4: SMART JOB PRIORITIZATION ENGINE
# Production-Level Code with ML-based Scoring and User Customization
# ===================================================================
# Purpose: Automatically prioritize job postings based on user profile
#          and preferences with optional user customization
# ===================================================================

from __future__ import annotations
import os
import json
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Tuple, Set
from dataclasses import dataclass, field, asdict
from collections import defaultdict
import pandas as pd
import numpy as np
import re

# ===================================================================
# LOGGING CONFIGURATION
# ===================================================================
# Root logging setup: every record goes both to a log file and to the console.
# The file handler is opened as UTF-8 explicitly because log messages emitted
# further down contain non-ASCII symbols; with a narrow platform default
# encoding those records cannot be written to the file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("job_prioritization.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
    # force=True removes handlers already attached to the root logger, so
    # running this configuration again does not duplicate every log line.
    force=True,
)
logger = logging.getLogger("JobPrioritization")

# ===================================================================
# USER PROFILE DATA CLASS
# ===================================================================

@dataclass
class UserProfile:
    """
    User profile containing skills, preferences, and requirements.

    All free-text list fields are normalized once in ``__post_init__`` so the
    scoring code can compare them directly:

    - skills, primary skills, must-have skills: lower-case
    - education: upper-case
    - preferred and deal-breaker locations: title-case
    - preferred companies: whitespace-stripped only

    Blank entries are dropped from every list.
    """

    # Personal information
    user_id: str
    name: str = "User"

    # Skills (primary_skills is the short list of strongest skills)
    skills: List[str] = field(default_factory=list)
    primary_skills: List[str] = field(default_factory=list)

    # Education
    education: List[str] = field(default_factory=list)
    graduation_year: Optional[int] = None

    # Experience
    experience_years: float = 0.0
    previous_roles: List[str] = field(default_factory=list)

    # Preferences
    preferred_locations: List[str] = field(default_factory=list)
    preferred_companies: List[str] = field(default_factory=list)
    preferred_work_mode: str = "Any"  # Remote, On-site, Hybrid, Any

    # Salary expectations (0 means "not specified")
    min_expected_salary: int = 0
    max_expected_salary: int = 0

    # Constraints
    must_have_skills: List[str] = field(default_factory=list)
    deal_breaker_locations: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Normalize the list fields so later comparisons are case-consistent."""
        self.skills = self._normalize(self.skills, str.lower)
        self.primary_skills = self._normalize(self.primary_skills, str.lower)
        self.must_have_skills = self._normalize(self.must_have_skills, str.lower)
        self.education = self._normalize(self.education, str.upper)
        self.preferred_locations = self._normalize(self.preferred_locations, str.title)
        # Deal-breaker locations are compared against title-cased job cities,
        # so they need the same normalization as the preferred locations.
        self.deal_breaker_locations = self._normalize(
            self.deal_breaker_locations, str.title
        )
        self.preferred_companies = self._normalize(self.preferred_companies)

    @staticmethod
    def _normalize(values: List[str], transform=None) -> List[str]:
        """Strip each entry, drop blank ones, and apply ``transform`` if given."""
        cleaned = []
        for value in values:
            text = value.strip()
            if not text:
                continue
            cleaned.append(transform(text) if transform else text)
        return cleaned

    @classmethod
    def from_dict(cls, data: Dict) -> UserProfile:
        """
        Create a UserProfile from a dictionary.

        Args:
            data: Mapping whose keys are UserProfile field names

        Returns:
            A new, normalized UserProfile

        Raises:
            TypeError: If ``data`` contains a key that is not a field name
                or omits ``user_id``
        """
        return cls(**data)

    @classmethod
    def create_sample_profile(cls) -> UserProfile:
        """Create a sample user profile for testing."""
        return cls(
            user_id="USER_001",
            name="Sample User",
            skills=["python", "java", "sql", "machine learning", "data analysis"],
            primary_skills=["python", "machine learning", "sql"],
            education=["B.TECH"],
            graduation_year=2024,
            experience_years=0.0,
            previous_roles=[],
            preferred_locations=["Bangalore", "Pune", "Remote"],
            preferred_companies=["Google", "Microsoft", "Amazon"],
            preferred_work_mode="Hybrid",
            min_expected_salary=400000,
            max_expected_salary=800000,
            must_have_skills=["python"],
            deal_breaker_locations=[],
        )


# ===================================================================
# PRIORITIZATION WEIGHTS CONFIGURATION
# ===================================================================

@dataclass
class PrioritizationWeights:
    """
    Relative importance of each scoring component.

    The ten weights are meant to add up to roughly 1.0; a warning is logged
    at construction time when the total falls outside 0.95-1.05, but the
    values are accepted either way.
    """

    # --- How well the job fits the user's profile (0.40 in total) ---
    skills_match_weight: float = 0.15
    experience_match_weight: float = 0.10
    education_match_weight: float = 0.10
    location_match_weight: float = 0.05

    # --- Quality of the posting itself (0.30 in total) ---
    completeness_weight: float = 0.10
    salary_competitiveness_weight: float = 0.10
    company_reputation_weight: float = 0.10

    # --- Time pressure (0.20 in total) ---
    deadline_urgency_weight: float = 0.15
    posting_freshness_weight: float = 0.05

    # --- Explicit user preferences (0.10 in total) ---
    preference_bonus_weight: float = 0.10

    def __post_init__(self):
        """Warn when the weights do not add up to approximately 1.0."""
        component_names = (
            'skills_match_weight',
            'experience_match_weight',
            'education_match_weight',
            'location_match_weight',
            'completeness_weight',
            'salary_competitiveness_weight',
            'company_reputation_weight',
            'deadline_urgency_weight',
            'posting_freshness_weight',
            'preference_bonus_weight',
        )
        total = sum([getattr(self, name) for name in component_names])

        within_tolerance = 0.95 <= total <= 1.05
        if within_tolerance:
            return
        logger.warning(f"Weights sum to {total:.2f}, expected ~1.0")

    @classmethod
    def get_default(cls) -> PrioritizationWeights:
        """Return an instance carrying the default weight values."""
        default_weights = cls()
        return default_weights

    def to_dict(self) -> Dict[str, float]:
        """Return the weights as a ``{field_name: value}`` dictionary."""
        as_mapping = asdict(self)
        return as_mapping


# ===================================================================
# JOB SCORING COMPONENTS
# ===================================================================

class JobScorer:
    """
    Calculates individual scoring components for job prioritization.
    Each method returns a score between 0.0 and 1.0.
    """
    
    def __init__(self, user_profile: UserProfile):
        """
        Initialize scorer with user profile.
        
        Args:
            user_profile: User's profile information
        """
        self.user = user_profile
        self.logger = logging.getLogger("JobScorer")
    
    def calculate_skills_match(self, job: pd.Series) -> float:
        """
        Calculate how well job skills match user skills.
        
        Scoring:
        - Primary skills match: 1.0 point each
        - Other skills match: 0.5 point each
        - Normalized by total required skills
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            job_skills_str = str(job.get('skills_required', ''))
            if not job_skills_str or job_skills_str == 'nan':
                return 0.0
            
            # Parse job skills
            job_skills = set([
                s.lower().strip() 
                for s in job_skills_str.split(',') 
                if s.strip()
            ])
            
            if not job_skills:
                return 0.0
            
            # Convert user skills to sets
            user_skills = set(self.user.skills)
            primary_skills = set(self.user.primary_skills)
            
            # Calculate matches
            primary_matches = job_skills & primary_skills
            other_matches = (job_skills & user_skills) - primary_matches
            
            # Score calculation
            score = 0.0
            score += len(primary_matches) * 1.0
            score += len(other_matches) * 0.5
            
            # Normalize by job requirements (max score = number of job skills)
            max_score = len(job_skills)
            normalized_score = min(score / max_score, 1.0) if max_score > 0 else 0.0
            
            return normalized_score
            
        except Exception as e:
            self.logger.error(f"Error calculating skills match: {e}")
            return 0.0
    
    def calculate_experience_match(self, job: pd.Series) -> float:
        """
        Calculate how well user's experience matches job requirements.
        
        Scoring logic:
        - Perfect match: 1.0
        - Within range: 1.0
        - Slightly under-qualified: 0.7
        - Over-qualified: 0.8
        - Far from requirement: 0.3
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            exp_min = float(job.get('experience_min_years', 0))
            exp_max = float(job.get('experience_max_years', 0))
            user_exp = self.user.experience_years
            
            # Fresher positions
            if exp_max == 0:
                return 1.0 if user_exp <= 2 else 0.7
            
            # User experience within range
            if exp_min <= user_exp <= exp_max:
                return 1.0
            
            # Slightly under-qualified (within 1 year)
            if user_exp < exp_min and (exp_min - user_exp) <= 1:
                return 0.7
            
            # Over-qualified but acceptable (within 2 years)
            if user_exp > exp_max and (user_exp - exp_max) <= 2:
                return 0.8
            
            # Far from requirements
            if user_exp < exp_min:
                gap = exp_min - user_exp
                return max(0.3, 1.0 - (gap * 0.2))
            else:
                gap = user_exp - exp_max
                return max(0.5, 1.0 - (gap * 0.15))
                
        except Exception as e:
            self.logger.error(f"Error calculating experience match: {e}")
            return 0.5
    
    def calculate_education_match(self, job: pd.Series) -> float:
        """
        Calculate education requirement match.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            job_education_str = str(job.get('education_required', ''))
            if not job_education_str or job_education_str == 'nan':
                return 0.8  # No requirement = acceptable
            
            # Parse job education requirements
            job_degrees = set([
                deg.upper().strip() 
                for deg in job_education_str.split(',') 
                if deg.strip()
            ])
            
            if not job_degrees:
                return 0.8
            
            # Check if user's education matches
            user_degrees = set(self.user.education)
            
            if job_degrees & user_degrees:
                return 1.0  # Perfect match
            
            # Check for equivalent degrees
            equivalents = {
                'B.TECH': {'B.E', 'BTECH', 'BE'},
                'M.TECH': {'M.E', 'MTECH', 'ME'},
                'BCA': {'B.SC', 'BSC'},
                'MCA': {'M.SC', 'MSC'}
            }
            
            for user_deg in user_degrees:
                for job_deg in job_degrees:
                    if job_deg in equivalents.get(user_deg, set()):
                        return 0.9  # Equivalent degree
            
            return 0.3  # No match
            
        except Exception as e:
            self.logger.error(f"Error calculating education match: {e}")
            return 0.5
    
    def calculate_location_match(self, job: pd.Series) -> float:
        """
        Calculate location preference match.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            job_location = str(job.get('location_city', '')).strip().title()
            work_mode = str(job.get('work_mode', 'On-site')).strip()
            
            # Deal-breaker check
            if job_location in self.user.deal_breaker_locations:
                return 0.0
            
            # Remote work preference
            if work_mode == 'Remote' and self.user.preferred_work_mode in ['Remote', 'Hybrid', 'Any']:
                return 1.0
            
            # Check preferred locations
            if job_location in self.user.preferred_locations:
                return 1.0
            
            # Hybrid acceptable
            if work_mode == 'Hybrid' and self.user.preferred_work_mode in ['Hybrid', 'Any']:
                return 0.8
            
            # No preference specified
            if not self.user.preferred_locations:
                return 0.6
            
            return 0.4  # Not in preferred locations
            
        except Exception as e:
            self.logger.error(f"Error calculating location match: {e}")
            return 0.5
    
    def calculate_completeness_score(self, job: pd.Series) -> float:
        """
        Use the job's existing completeness score.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Completeness score (0.0 to 1.0)
        """
        try:
            return float(job.get('completeness_score', 0.5))
        except:
            return 0.5
    
    def calculate_salary_competitiveness(self, job: pd.Series) -> float:
        """
        Calculate how competitive the salary is compared to expectations.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            job_salary = float(job.get('salary_max', 0))
            
            # No salary information
            if job_salary == 0:
                return 0.5  # Neutral score
            
            # No expectations set
            if self.user.max_expected_salary == 0:
                return 0.7  # Default positive
            
            # Calculate relative to expectations
            expected_min = self.user.min_expected_salary
            expected_max = self.user.max_expected_salary
            
            # Above expectations
            if job_salary >= expected_max:
                return 1.0
            
            # Within range
            if expected_min <= job_salary < expected_max:
                # Linear interpolation within range
                range_size = expected_max - expected_min
                position = job_salary - expected_min
                return 0.7 + (0.3 * (position / range_size))
            
            # Below minimum but close
            if job_salary >= expected_min * 0.8:
                return 0.6
            
            # Below expectations
            return 0.4
            
        except Exception as e:
            self.logger.error(f"Error calculating salary competitiveness: {e}")
            return 0.5
    
    def calculate_company_reputation(self, job: pd.Series) -> float:
        """
        Calculate company reputation score.
        
        Based on:
        - User's preferred companies list
        - Company name recognition patterns
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            company_name = str(job.get('company_name', '')).strip()
            
            if not company_name or company_name == 'nan':
                return 0.5
            
            # Exact match with preferred companies
            if company_name in self.user.preferred_companies:
                return 1.0
            
            # Partial match (case-insensitive)
            company_lower = company_name.lower()
            for preferred in self.user.preferred_companies:
                if preferred.lower() in company_lower or company_lower in preferred.lower():
                    return 0.9
            
            # Well-known companies (generic list)
            top_companies = {
                'google', 'microsoft', 'amazon', 'apple', 'meta', 'facebook',
                'netflix', 'adobe', 'oracle', 'ibm', 'intel', 'nvidia',
                'tcs', 'infosys', 'wipro', 'hcl', 'cognizant', 'accenture'
            }
            
            if any(top_comp in company_lower for top_comp in top_companies):
                return 0.8
            
            # Default score for unknown companies
            return 0.6
            
        except Exception as e:
            self.logger.error(f"Error calculating company reputation: {e}")
            return 0.5
    
    def calculate_deadline_urgency(self, job: pd.Series) -> float:
        """
        Calculate urgency based on application deadline.
        
        Scoring:
        - No deadline: 0.5 (neutral)
        - > 30 days: 0.3 (low urgency)
        - 15-30 days: 0.5 (medium)
        - 7-15 days: 0.8 (high)
        - < 7 days: 1.0 (very high)
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            deadline_str = str(job.get('application_deadline', ''))
            
            if not deadline_str or deadline_str == 'nan' or deadline_str == 'None':
                return 0.5  # No deadline = medium priority
            
            # Parse deadline
            try:
                deadline = datetime.strptime(deadline_str, '%Y-%m-%d')
                days_remaining = (deadline - datetime.now()).days
                
                if days_remaining < 0:
                    return 0.0  # Expired
                elif days_remaining <= 7:
                    return 1.0  # Very urgent
                elif days_remaining <= 15:
                    return 0.8  # High urgency
                elif days_remaining <= 30:
                    return 0.5  # Medium urgency
                else:
                    return 0.3  # Low urgency
                    
            except:
                return 0.5
                
        except Exception as e:
            self.logger.error(f"Error calculating deadline urgency: {e}")
            return 0.5
    
    def calculate_posting_freshness(self, job: pd.Series) -> float:
        """
        Calculate how recent the job posting is.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            timestamp_str = str(job.get('extraction_timestamp', ''))
            
            if not timestamp_str or timestamp_str.lower() == 'nan':
                return 0.6  # Unknown = medium score
            
            try:
                posting_time = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
                posting_time_naive = posting_time.replace(tzinfo=None)
                now_naive = datetime.now().replace(tzinfo=None)

                days_old = (now_naive - posting_time_naive).days
                
                if days_old <= 3:
                    return 1.0  # Very fresh
                elif days_old <= 7:
                    return 0.8  # Fresh
                elif days_old <= 14:
                    return 0.6  # Recent
                elif days_old <= 30:
                    return 0.4  # Older
                else:
                    return 0.2  # Old
                    
            except:
                return 0.6
                
        except Exception as e:
            self.logger.error(f"Error calculating posting freshness: {e}")
            return 0.5
    
    def calculate_preference_bonus(self, job: pd.Series) -> float:
        """
        Calculate bonus score based on multiple preference matches.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Score between 0.0 and 1.0
        """
        try:
            bonus = 0.0
            
            # Preferred company bonus
            company_name = str(job.get('company_name', ''))
            if company_name in self.user.preferred_companies:
                bonus += 0.4
            
            # Preferred location bonus
            location = str(job.get('location_city', ''))
            if location in self.user.preferred_locations:
                bonus += 0.3
            
            # Work mode match bonus
            work_mode = str(job.get('work_mode', ''))
            if work_mode == self.user.preferred_work_mode or self.user.preferred_work_mode == 'Any':
                bonus += 0.3
            
            return min(bonus, 1.0)
            
        except Exception as e:
            self.logger.error(f"Error calculating preference bonus: {e}")
            return 0.0


# ===================================================================
# SMART PRIORITIZATION ENGINE
# ===================================================================

class SmartPrioritizationEngine:
    """
    Main prioritization engine that orchestrates scoring and ranking.
    Combines all scoring components with configurable weights.
    """
    
    def __init__(
        self,
        user_profile: UserProfile,
        weights: Optional[PrioritizationWeights] = None
    ):
        """
        Set up the engine for one user and log a short start-up banner.

        Args:
            user_profile: Profile the jobs are ranked for
            weights: Scoring weights to use; the default weights are used
                when this is omitted
        """
        active_weights = weights if weights else PrioritizationWeights.get_default()

        self.user_profile = user_profile
        self.weights = active_weights
        self.scorer = JobScorer(user_profile)
        self.logger = logging.getLogger("PrioritizationEngine")

        # Start-up banner: who the engine is running for
        log = self.logger.info
        rule = "=" * 70
        log(rule)
        log("SMART PRIORITIZATION ENGINE INITIALIZED")
        log(rule)
        log(f"User: {user_profile.name} ({user_profile.user_id})")
        log(f"Skills: {', '.join(user_profile.primary_skills[:5])}")
        log(f"Experience: {user_profile.experience_years} years")
        log(rule + "\n")
    
    def calculate_job_priority(self, job: pd.Series) -> Dict[str, float]:
        """
        Calculate comprehensive priority score for a job.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            Dictionary with score breakdown and final priority
        """
        try:
            # Calculate all scoring components
            scores = {
                'skills_match': self.scorer.calculate_skills_match(job),
                'experience_match': self.scorer.calculate_experience_match(job),
                'education_match': self.scorer.calculate_education_match(job),
                'location_match': self.scorer.calculate_location_match(job),
                'completeness': self.scorer.calculate_completeness_score(job),
                'salary_competitiveness': self.scorer.calculate_salary_competitiveness(job),
                'company_reputation': self.scorer.calculate_company_reputation(job),
                'deadline_urgency': self.scorer.calculate_deadline_urgency(job),
                'posting_freshness': self.scorer.calculate_posting_freshness(job),
                'preference_bonus': self.scorer.calculate_preference_bonus(job)
            }
            
            # Calculate weighted total (0-1 scale)
            final_score = (
                scores['skills_match'] * self.weights.skills_match_weight +
                scores['experience_match'] * self.weights.experience_match_weight +
                scores['education_match'] * self.weights.education_match_weight +
                scores['location_match'] * self.weights.location_match_weight +
                scores['completeness'] * self.weights.completeness_weight +
                scores['salary_competitiveness'] * self.weights.salary_competitiveness_weight +
                scores['company_reputation'] * self.weights.company_reputation_weight +
                scores['deadline_urgency'] * self.weights.deadline_urgency_weight +
                scores['posting_freshness'] * self.weights.posting_freshness_weight +
                scores['preference_bonus'] * self.weights.preference_bonus_weight
            )
            
            # Apply must-have filters (hard constraints)
            if not self._check_must_have_requirements(job):
                final_score *= 0.3  # Severe penalty for missing must-haves
            
            # Scale to 0-100
            scores['final_priority_score'] = final_score * 100
            scores['priority_tier'] = self._get_priority_tier(scores['final_priority_score'])
            
            return scores
            
        except Exception as e:
            self.logger.error(f"Error calculating job priority: {e}")
            return {'final_priority_score': 0.0, 'priority_tier': 'Low'}
    
    def _check_must_have_requirements(self, job: pd.Series) -> bool:
        """
        Check if job meets must-have requirements.
        
        Args:
            job: Job posting as pandas Series
            
        Returns:
            True if requirements met, False otherwise
        """
        try:
            # Check must-have skills
            if self.user_profile.must_have_skills:
                job_skills_str = str(job.get('skills_required', '')).lower()
                for must_have_skill in self.user_profile.must_have_skills:
                    if must_have_skill.lower() not in job_skills_str:
                        return False
            return True
            
        except Exception as e:
            self.logger.error(f"Error checking must-have requirements: {e}")
            return True
    
    def _get_priority_tier(self, score: float) -> str:
        """
        Convert numerical score to priority tier.
        
        Args:
            score: Priority score (0-100)
            
        Returns:
            Priority tier string
        """
        if score >= 80:
            return "Must Apply"
        elif score >= 65:
            return "High Priority"
        elif score >= 50:
            return "Medium Priority"
        elif score >= 35:
            return "Low Priority"
        else:
            return "Not Recommended"
    
    def prioritize_jobs(
        self,
        jobs_df: pd.DataFrame,
        save_output: bool = True,
        output_path: str = "prioritized_jobs.csv"
    ) -> pd.DataFrame:
        """
        Prioritize all jobs in the dataset.
        
        Args:
            jobs_df: DataFrame with job postings
            save_output: Whether to save output CSV
            output_path: Path for output file
            
        Returns:
            DataFrame with priority scores added and sorted
        """
        self.logger.info(f"Prioritizing {len(jobs_df)} job postings...")
        
        # Calculate priority for each job
        priority_data = []
        
        for idx, job in jobs_df.iterrows():
            try:
                scores = self.calculate_job_priority(job)
                priority_data.append(scores)
                
                # Log progress every 50 jobs
                if (idx + 1) % 50 == 0:
                    self.logger.info(f"   Processed {idx+1}/{len(jobs_df)} jobs")
                    
            except Exception as e:
                self.logger.error(f"Error processing job {idx}: {e}")
                priority_data.append({'final_priority_score': 0.0, 'priority_tier': 'Low'})
        
        # Add priority scores to DataFrame
        priority_df = pd.DataFrame(priority_data)
        result_df = pd.concat([jobs_df.reset_index(drop=True), priority_df], axis=1)
        
        # Sort by priority score
        result_df = result_df.sort_values('final_priority_score', ascending=False)
        result_df = result_df.reset_index(drop=True)
        
        # Generate statistics
        self._generate_priority_statistics(result_df)
        
        # Save output
        if save_output:
            try:
                result_df.to_csv(output_path, index=False)
                self.logger.info(f"Saved prioritized jobs to: {output_path}\n")
            except Exception as e:
                self.logger.error(f"Error saving output: {e}")
        
        return result_df
    
    def _generate_priority_statistics(self, df: pd.DataFrame):
        """
        Generate and log priority statistics.
        
        Args:
            df: Prioritized DataFrame
        """
        self.logger.info(f"\n{'='*70}")
        self.logger.info(f"PRIORITIZATION STATISTICS")
        self.logger.info(f"{'='*70}\n")
        
        # Overall statistics
        self.logger.info(f"OVERALL STATISTICS:")
        self.logger.info(f"   Total Jobs: {len(df)}")
        self.logger.info(f"   Average Priority Score: {df['final_priority_score'].mean():.1f}/100")
        self.logger.info(f"   Median Priority Score: {df['final_priority_score'].median():.1f}/100")
        self.logger.info(f"   Highest Score: {df['final_priority_score'].max():.1f}/100")
        self.logger.info(f"   Lowest Score: {df['final_priority_score'].min():.1f}/100")
        
        # Priority tier distribution
        self.logger.info(f"\nPRIORITY TIER DISTRIBUTION:")
        tier_counts = df['priority_tier'].value_counts()
        for tier, count in tier_counts.items():
            percentage = (count / len(df)) * 100
            self.logger.info(f"   {tier}: {count} jobs ({percentage:.1f}%)")
        
        # Top scoring components
        self.logger.info(f"\nAVERAGE COMPONENT SCORES:")
        components = [
            ('Skills Match', 'skills_match'),
            ('Experience Match', 'experience_match'),
            ('Education Match', 'education_match'),
            ('Location Match', 'location_match'),
            ('Salary Competitiveness', 'salary_competitiveness'),
            ('Company Reputation', 'company_reputation')
        ]
        
        for name, col in components:
            if col in df.columns:
                avg_score = df[col].mean() * 100
                self.logger.info(f"   {name}: {avg_score:.1f}%")
        
        # Top 10 jobs
        self.logger.info(f"\nTOP 10 RECOMMENDED JOBS:")
        self.logger.info(f"{'‚îÄ'*70}")
        
        for idx, (_, job) in enumerate(df.head(10).iterrows(), 1):
            self.logger.info(
                f"\n#{idx} | Score: {job['final_priority_score']:.1f}/100 | "
                f"{job['priority_tier']}"
            )
            self.logger.info(f"   Position: {job['position_title']}")
            self.logger.info(f"   Company: {job['company_name']}")
            self.logger.info(f"   Location : {job['location_city']} ({job['work_mode']})")
            skills_preview = str(job['skills_required'])[:50]
            self.logger.info(f"   Skills: {skills_preview}...")
        
            if job['salary_max'] > 0:
                salary_lpa = job['salary_max'] / 100000
                self.logger.info(f"   Salary: ‚Çπ{salary_lpa:.1f} LPA")
    
        self.logger.info(f"\n{'='*70}\n")

    def get_recommended_jobs(
        self,
        df: pd.DataFrame,
        min_score: float = 65.0,
        max_results: int = 20
    ) -> pd.DataFrame:
        """
        Get top recommended jobs above threshold.
        
        Args:
            df: Prioritized DataFrame
            min_score: Minimum priority score
            max_results: Maximum number of results
            
        Returns:
            Filtered DataFrame with top recommendations
        """
        filtered = df[df['final_priority_score'] >= min_score]

        if not filtered.empty:
            self.logger.info(f"found {len(filtered)} jobs above score {min_score}")
            return filtered.head(max_results)
        
        # Attemp 2 : Fall back
        self.logger.warning(f"No jobs found above score {min_score}. Returning top {max_results} available jobs instead.")
        return df.head(max_results)
    
# ===================================================================
# USER CUSTOMIZATION INTERFACE
# ===================================================================
class UserCustomization:
    """
    Allows users to customize prioritization weights and preferences.

    All operations read and write ``engine.weights``. A "component" is the
    name of a public attribute on that object ending in ``_weight``
    (e.g. 'skills_match_weight'); any other name is rejected with an
    error log and leaves the weights untouched.
    """
    def __init__(self, engine: SmartPrioritizationEngine):
        """
        Initialize customization interface.

        Args:
            engine: Prioritization engine instance
        """
        self.engine = engine
        self.logger = logging.getLogger("UserCustomization")
        # Library defaults, not a snapshot of the engine's weights at construction
        self.original_weights = PrioritizationWeights.get_default()

    def _weight_names(self) -> List[str]:
        """Return the names of all public '*_weight' attributes on the engine's weights."""
        return [
            attr for attr in dir(self.engine.weights)
            if attr.endswith('_weight') and not attr.startswith('_')
        ]

    def _is_known_component(self, component: str) -> bool:
        """
        Check that ``component`` names a weight, logging an error when it does not.

        A plain ``hasattr`` test is not enough here: it also accepts methods
        such as 'to_dict', which a later ``setattr`` would overwrite with a float.
        """
        if component in self._weight_names():
            return True
        self.logger.error(f"Unknown component: {component}")
        return False

    def adjust_weight(
        self,
        component: str,
        new_weight: float,
        redistribute: bool = True
    ):
        """
        Adjust weight for a specific component.

        Invalid input (unknown component or out-of-range weight) is logged
        and ignored; nothing is raised.

        Args:
            component: Component name (e.g., 'skills_match_weight')
            new_weight: New weight value (0.0 to 1.0)
            redistribute: Whether to rescale the other weights afterwards
        """
        if not self._is_known_component(component):
            return

        if not (0.0 <= new_weight <= 1.0):
            self.logger.error("Weight must be between 0.0 and 1.0")
            return

        old_weight = getattr(self.engine.weights, component)
        setattr(self.engine.weights, component, new_weight)

        self.logger.info(
            f"✓ Adjusted {component}: {old_weight:.2f} → {new_weight:.2f}"
        )

        if redistribute:
            self._redistribute_weights(component)

    def _redistribute_weights(self, changed_component: str):
        """
        Redistribute weights to maintain sum ~1.0.

        Nothing happens while the total stays within [0.95, 1.05]. Outside
        that band every weight except ``changed_component`` is multiplied by
        a common factor so that the total becomes 1.0. If all other weights
        are zero there is nothing to scale and the total is left as it is.

        Args:
            changed_component: Component that was changed
        """
        weight_attrs = self._weight_names()
        total = sum(getattr(self.engine.weights, attr) for attr in weight_attrs)

        if 0.95 <= total <= 1.05:
            return

        other_attrs = [attr for attr in weight_attrs if attr != changed_component]
        other_total = sum(getattr(self.engine.weights, attr) for attr in other_attrs)

        if other_total > 0:
            scale_factor = (1.0 - getattr(self.engine.weights, changed_component)) / other_total
            for attr in other_attrs:
                current = getattr(self.engine.weights, attr)
                setattr(self.engine.weights, attr, current * scale_factor)

    def boost_component(self, component: str, multiplier: float = 1.5):
        """
        Boost a component's weight by a multiplier.

        The result is capped at 0.5 before being applied through
        ``adjust_weight`` (which also redistributes the other weights).

        Args:
            component: Component name
            multiplier: Multiplication factor
        """
        if not self._is_known_component(component):
            return

        current = getattr(self.engine.weights, component)
        new_weight = min(current * multiplier, 0.5)  # Cap at 0.5
        self.adjust_weight(component, new_weight)

    def penalize_component(self, component: str, multiplier: float = 0.5):
        """
        Penalize a component's weight by a multiplier.

        The result is floored at 0.01 before being applied through
        ``adjust_weight`` (which also redistributes the other weights).

        Args:
            component: Component name
            multiplier: Multiplication factor
        """
        if not self._is_known_component(component):
            return

        current = getattr(self.engine.weights, component)
        new_weight = max(current * multiplier, 0.01)  # Min at 0.01
        self.adjust_weight(component, new_weight)

    def reset_weights(self):
        """Reset all weights to default values by installing a fresh defaults object."""
        self.engine.weights = PrioritizationWeights.get_default()
        self.logger.info("✓ Reset all weights to default values")

    def save_preferences(self, filepath: str = "user_preferences.json"):
        """
        Save current weights and the user profile to a JSON file.

        Best effort: any failure is logged and swallowed.

        Args:
            filepath: Path to save preferences
        """
        try:
            preferences = {
                'weights': self.engine.weights.to_dict(),
                'user_profile': asdict(self.engine.user_profile),
                'saved_at': datetime.now().isoformat()
            }

            # Explicit encoding so the file reads back the same on every platform
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(preferences, f, indent=2)

            self.logger.info(f"Saved preferences to: {filepath}")

        except Exception as e:
            self.logger.error(f"Error saving preferences: {e}")

    def load_preferences(self, filepath: str = "user_preferences.json"):
        """
        Load weights from a preferences file written by ``save_preferences``.

        Only the 'weights' entry is applied; the stored user profile is not
        restored. Best effort: any failure (missing file, invalid JSON,
        unexpected weight keys) is logged and the current weights are kept.

        Args:
            filepath: Path to load preferences from
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                preferences = json.load(f)

            # A file without a 'weights' entry yields the default weights
            weights_dict = preferences.get('weights', {})
            self.engine.weights = PrioritizationWeights(**weights_dict)

            self.logger.info(f"Loaded preferences from: {filepath}")

        except Exception as e:
            self.logger.error(f"Error loading preferences: {e}")

# ===================================================================
# MAIN EXECUTION PIPELINE
# ===================================================================
def main(
    jobs_csv: str = "structured_job_postings.csv",
    user_profile: Optional[UserProfile] = None,
    custom_weights: Optional[PrioritizationWeights] = None
) -> Tuple[pd.DataFrame, SmartPrioritizationEngine]:
    """
    Run the job prioritization pipeline end to end.

    Loads the jobs CSV, builds a SmartPrioritizationEngine, asks it to
    prioritize the jobs (with saving to "prioritized_jobs.csv" requested),
    then writes the recommended subset to "top_recommendations.csv".

    Args:
        jobs_csv: Path to structured jobs CSV
        user_profile: User profile (a sample profile is created if None)
        custom_weights: Custom weights, handed to the engine unchanged
            (may be None)

    Returns:
        Tuple of (prioritized DataFrame, engine instance)

    Raises:
        FileNotFoundError: If ``jobs_csv`` does not exist
        Exception: Any other failure is logged and re-raised unchanged
    """
    rule = "=" * 70

    logger.info("\n" + rule)
    logger.info("STARTING JOB PRIORITIZATION PIPELINE")
    logger.info(rule + "\n")

    try:
        # --- Load -------------------------------------------------------
        logger.info(f"Loading jobs from: {jobs_csv}")
        if not os.path.exists(jobs_csv):
            raise FileNotFoundError(f"File not found at: {jobs_csv}")

        jobs_df = pd.read_csv(jobs_csv)
        logger.info(f"Loaded {len(jobs_df)} job postings\n")

        # --- Profile ----------------------------------------------------
        if user_profile is None:
            logger.info("Creating sample user profile...")
            user_profile = UserProfile.create_sample_profile()

        # --- Score ------------------------------------------------------
        engine = SmartPrioritizationEngine(
            user_profile=user_profile,
            weights=custom_weights,
        )
        prioritized_df = engine.prioritize_jobs(
            jobs_df=jobs_df,
            save_output=True,
            output_path="prioritized_jobs.csv",
        )

        # --- Recommend --------------------------------------------------
        logger.info("Generating recommendations...")
        recommendations = engine.get_recommended_jobs(
            df=prioritized_df,
            min_score=65.0,
            max_results=20,
        )
        logger.info(f"Found {len(recommendations)} high-priority recommendations\n")

        recommendations.to_csv("top_recommendations.csv", index=False)
        logger.info("Saved top recommendations to: top_recommendations.csv\n")

        # --- Closing summary --------------------------------------------
        closing_lines = (
            rule,
            "PRIORITIZATION PIPELINE COMPLETED",
            rule,
            "Output Files:",
            "   - prioritized_jobs.csv (all jobs with scores)",
            "   - top_recommendations.csv (high-priority jobs)",
            "   - job_prioritization.log (execution log)",
            rule + "\n",
        )
        for line in closing_lines:
            logger.info(line)

        return prioritized_df, engine

    except Exception as exc:
        # Record the failure, then let the caller see the original exception
        logger.error(f"PIPELINE FAILED: {exc}")
        raise


# ===================================================================
# EXAMPLE USAGE WITH CUSTOMIZATION
# ===================================================================
def example_with_customization():
    """
    Demonstrate the customization workflow.

    Runs the pipeline for a hand-built profile, raises the salary weight,
    lowers the company-reputation weight, re-prioritizes with the adjusted
    weights, and saves the resulting preferences to "my_preferences.json".
    """
    rule = "=" * 70
    logger.info("\n" + rule)
    logger.info("EXAMPLE: USER CUSTOMIZATION")
    logger.info(rule + "\n")

    # Profile used for this walkthrough
    profile_fields = {
        "user_id": "USER_123",
        "name": "John Doe",
        "skills": ["python", "machine learning", "data analysis", "sql"],
        "primary_skills": ["python", "machine learning"],
        "education": ["B.TECH"],
        "experience_years": 1.0,
        "preferred_locations": ["Bangalore", "Remote"],
        "preferred_companies": ["Google", "Microsoft"],
        "min_expected_salary": 500000,
        "max_expected_salary": 800000,
        "must_have_skills": ["python"],
    }
    user = UserProfile(**profile_fields)

    # First pass with the default weights
    csv_path = r"D:\Projects By Month\November 2025\Placement Mail Analysis System\.venv\Phase_scripts\Phase 3\structured_job_postings.csv"
    prioritized_df, engine = main(jobs_csv=csv_path, user_profile=user)

    customizer = UserCustomization(engine)

    # Salary matters more to this user
    logger.info("User customization: Boosting salary importance...")
    customizer.boost_component('salary_competitiveness_weight', multiplier=1.5)

    # Company reputation matters less
    logger.info("User customization: Reducing company reputation importance...")
    customizer.penalize_component('company_reputation_weight', multiplier=0.7)

    # Second pass with the adjusted weights; the result is only saved, not used here
    logger.info("\nRe-prioritizing with custom weights...\n")
    engine.prioritize_jobs(
        jobs_df=prioritized_df,
        save_output=True,
        output_path="custom_prioritized_jobs.csv",
    )

    customizer.save_preferences("my_preferences.json")

    logger.info("Customization example complete!\n")

# ===================================================================
# ENTRY POINT
# =========================================
if __name__ == "__main__":
    # Run the pipeline once with the built-in sample profile and print
    # the five leading rows of the prioritized result.
    user = UserProfile.create_sample_profile()

    path = r"D:\Projects By Month\November 2025\Placement Mail Analysis System\.venv\Phase_scripts\Phase 3\structured_job_postings.csv"

    # Run prioritization
    prioritized_df, engine = main(
        jobs_csv=path,
        user_profile=user
    )

    # Display sample of top jobs
    print("\n" + "="*70)
    print("TOP 5 RECOMMENDED JOBS:")
    print("="*70)

    top_5 = prioritized_df.head(5)
    # Number the rows by position: the DataFrame index label is not
    # guaranteed to equal the rank (or even to be an integer).
    for rank, (_, job) in enumerate(top_5.iterrows(), 1):
        print(f"\n#{rank} | Score: {job['final_priority_score']:.1f}/100")
        print(f"   {job['position_title']} at {job['company_name']}")
        print(f"   Location: {job['location_city']} | Salary: ₹{job['salary_max']/100000:.1f} LPA")
        print(f"   Skills: {str(job['skills_required'])[:60]}...")

    print("\n" + "="*70)

In [None]:
# ===================================================================
# PHASE 4: SMART JOB PRIORITIZATION ENGINE
# Production-Level Code with ML-based Scoring and User Customization
# ===================================================================
# Purpose: Automatically prioritize job postings based on user profile
#          and preferences with optional user customization
# ===================================================================

from __future__ import annotations
import os
import json
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Tuple, Set
from dataclasses import dataclass, field, asdict
from collections import defaultdict
import pandas as pd
import numpy as np
import re

# ===================================================================
# LOGGING CONFIGURATION
# ===================================================================
# force=True is essential in Jupyter to reset handlers on re-runs
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        # Log messages in this module contain non-ASCII symbols; without an
        # explicit encoding the file handler uses the platform default and
        # can fail to encode them (e.g. cp1252 on Windows).
        logging.FileHandler("job_prioritization.log", encoding="utf-8"),
        logging.StreamHandler()
    ],
    force=True
)
# Module-wide logger used by the pipeline functions
logger = logging.getLogger("JobPrioritization")

# ===================================================================
# USER PROFILE DATA CLASS
# ===================================================================

@dataclass
class UserProfile:
    """
    User profile containing skills, preferences, and requirements.
    This is the foundation for personalized job matching.

    All list fields are normalized in ``__post_init__``: entries are
    stripped, blank entries are dropped, and a per-field casing is applied
    (skills lower-case, education upper-case, locations title-case).
    """

    # Personal Information
    user_id: str
    name: str = "User"

    # Skills (most important for matching)
    skills: List[str] = field(default_factory=list)
    primary_skills: List[str] = field(default_factory=list)  # Top 3-5 skills

    # Education
    education: List[str] = field(default_factory=list)
    graduation_year: Optional[int] = None

    # Experience
    experience_years: float = 0.0
    previous_roles: List[str] = field(default_factory=list)

    # Preferences
    preferred_locations: List[str] = field(default_factory=list)
    preferred_companies: List[str] = field(default_factory=list)
    preferred_work_mode: str = "Any"  # Remote, On-site, Hybrid, Any

    # Salary expectations
    min_expected_salary: int = 0
    max_expected_salary: int = 0

    # Constraints
    must_have_skills: List[str] = field(default_factory=list)
    deal_breaker_locations: List[str] = field(default_factory=list)

    @staticmethod
    def _clean(values: List[str], recase=None) -> List[str]:
        """Strip each entry, drop blanks, and apply ``recase`` (a str method) if given."""
        stripped = (value.strip() for value in values)
        return [recase(value) if recase else value for value in stripped if value]

    def __post_init__(self):
        """Normalize user profile data so later comparisons are case-consistent."""
        self.skills = self._clean(self.skills, str.lower)
        self.primary_skills = self._clean(self.primary_skills, str.lower)
        self.education = self._clean(self.education, str.upper)
        self.preferred_locations = self._clean(self.preferred_locations, str.title)
        # Company names keep their casing; a blank entry would otherwise be a
        # substring of every company name in partial matching.
        self.preferred_companies = self._clean(self.preferred_companies)
        # Constraints use the same casing as the fields they mirror:
        # skills are lower-case, locations are title-case.
        self.must_have_skills = self._clean(self.must_have_skills, str.lower)
        self.deal_breaker_locations = self._clean(self.deal_breaker_locations, str.title)

    @classmethod
    def from_dict(cls, data: Dict) -> UserProfile:
        """
        Create UserProfile from dictionary.

        Keys must match the field names; an unknown key raises TypeError.
        """
        return cls(**data)

    @classmethod
    def create_sample_profile(cls) -> UserProfile:
        """Create a sample user profile for testing."""
        return cls(
            user_id="USER_001",
            name="Sample User",
            skills=["python", "java", "sql", "machine learning", "data analysis"],
            primary_skills=["python", "machine learning", "sql"],
            education=["B.TECH"],
            graduation_year=2024,
            experience_years=0.0,
            previous_roles=[],
            preferred_locations=["Bangalore", "Pune", "Remote"],
            preferred_companies=["Google", "Microsoft", "Amazon"],
            preferred_work_mode="Hybrid",
            min_expected_salary=400000,
            max_expected_salary=800000,
            must_have_skills=["python"],
            deal_breaker_locations=[]
        )


# ===================================================================
# PRIORITIZATION WEIGHTS CONFIGURATION
# ===================================================================

@dataclass
class PrioritizationWeights:
    """
    Weights applied to each scoring component during prioritization.

    The defaults add up to 1.0. Construction never fails on an unusual
    total; it only logs a warning when the sum leaves [0.95, 1.05].
    """

    # Profile match group (defaults total 0.40)
    skills_match_weight: float = 0.15
    experience_match_weight: float = 0.10
    education_match_weight: float = 0.10
    location_match_weight: float = 0.05

    # Job quality group (defaults total 0.30)
    completeness_weight: float = 0.10
    salary_competitiveness_weight: float = 0.10
    company_reputation_weight: float = 0.10

    # Urgency group (defaults total 0.20)
    deadline_urgency_weight: float = 0.15
    posting_freshness_weight: float = 0.05

    # User preference group (defaults total 0.10)
    preference_bonus_weight: float = 0.10

    def __post_init__(self):
        """Warn (without raising) when the weights do not sum to roughly 1.0."""
        component_names = (
            'skills_match_weight',
            'experience_match_weight',
            'education_match_weight',
            'location_match_weight',
            'completeness_weight',
            'salary_competitiveness_weight',
            'company_reputation_weight',
            'deadline_urgency_weight',
            'posting_freshness_weight',
            'preference_bonus_weight',
        )

        total = 0
        for component in component_names:
            total += getattr(self, component)

        within_tolerance = 0.95 <= total <= 1.05
        if not within_tolerance:
            logger.warning(f"Weights sum to {total:.2f}, expected ~1.0")

    @classmethod
    def get_default(cls) -> PrioritizationWeights:
        """Return an instance carrying the default value of every weight."""
        return cls()

    def to_dict(self) -> Dict[str, float]:
        """Return the weights as a plain ``{field name: value}`` dictionary."""
        return asdict(self)


# ===================================================================
# JOB SCORING COMPONENTS
# ===================================================================

class JobScorer:
    """
    Calculates individual scoring components for job prioritization.

    Every ``calculate_*`` method takes one job row and returns a float
    score. Each method is best effort: an unexpected error is logged and a
    fixed fallback score is returned instead of raising.

    Missing cells are expected to arrive as NaN (as produced by
    ``pd.read_csv``) and are treated as "no value" rather than as numbers.
    """

    # Groups of degree labels treated as interchangeable. Membership is
    # symmetric: any two labels in the same group count as equivalent.
    _EQUIVALENT_DEGREES = (
        frozenset({'B.TECH', 'B.E', 'BTECH', 'BE'}),
        frozenset({'M.TECH', 'M.E', 'MTECH', 'ME'}),
        frozenset({'BCA', 'B.SC', 'BSC'}),
        frozenset({'MCA', 'M.SC', 'MSC'}),
    )

    # Well-known companies (generic list), matched as lower-case substrings
    _TOP_COMPANIES = frozenset({
        'google', 'microsoft', 'amazon', 'apple', 'meta', 'facebook',
        'netflix', 'adobe', 'oracle', 'ibm', 'intel', 'nvidia',
        'tcs', 'infosys', 'wipro', 'hcl', 'cognizant', 'accenture',
    })

    def __init__(self, user_profile: UserProfile):
        """
        Initialize scorer with user profile.

        Args:
            user_profile: User's profile information
        """
        self.user = user_profile
        self.logger = logging.getLogger("JobScorer")

    @staticmethod
    def _number_or_zero(value) -> float:
        """
        Convert a cell to float, mapping None, empty and NaN to 0.0.

        NaN is truthy, so ``float(value or 0)`` alone lets it through and
        every later comparison silently evaluates to False. Unparseable
        strings still raise ValueError for the caller's handler.
        """
        number = float(value or 0)
        return 0.0 if pd.isna(number) else number

    def calculate_skills_match(self, job: pd.Series) -> float:
        """
        Calculate how well job skills match user skills.

        'skills_required' is read as a comma-separated list. A matched
        primary skill counts 1.0, any other matched user skill 0.5; the sum
        is divided by the number of required skills and capped at 1.0.
        Returns 0.0 when the job lists no skills or on error.
        """
        try:
            job_skills_str = str(job.get('skills_required', ''))
            if not job_skills_str or job_skills_str.lower() == 'nan':
                return 0.0

            job_skills = {
                s.lower().strip()
                for s in job_skills_str.split(',')
                if s.strip()
            }
            if not job_skills:
                return 0.0

            user_skills = set(self.user.skills)
            primary_skills = set(self.user.primary_skills)

            primary_matches = job_skills & primary_skills
            other_matches = (job_skills & user_skills) - primary_matches

            score = len(primary_matches) * 1.0 + len(other_matches) * 0.5

            # Normalize by job requirements (max score = number of job skills)
            return min(score / len(job_skills), 1.0)

        except Exception as e:
            self.logger.error(f"Error calculating skills match: {e}")
            return 0.0

    def calculate_experience_match(self, job: pd.Series) -> float:
        """
        Calculate how well user's experience matches job requirements.

        Reads 'experience_min_years' and 'experience_max_years'; missing or
        NaN values count as 0. When both are 0 the posting is treated as a
        fresher position. A maximum below the minimum (typically max 0 with
        a positive minimum) is treated as "minimum or more" with no upper
        bound. Returns 0.5 on error.
        """
        try:
            exp_min = self._number_or_zero(job.get('experience_min_years', 0))
            exp_max = self._number_or_zero(job.get('experience_max_years', 0))
            user_exp = self.user.experience_years

            # Fresher positions: no experience requirement stated at all
            if exp_min == 0 and exp_max == 0:
                return 1.0 if user_exp <= 2 else 0.7

            # A positive minimum without a usable maximum is open-ended,
            # not a fresher posting
            if exp_max < exp_min:
                exp_max = float('inf')

            # User experience within range
            if exp_min <= user_exp <= exp_max:
                return 1.0

            # Slightly under-qualified (within 1 year)
            if user_exp < exp_min and (exp_min - user_exp) <= 1:
                return 0.7

            # Over-qualified but acceptable (within 2 years)
            if user_exp > exp_max and (user_exp - exp_max) <= 2:
                return 0.8

            # Far from requirements: score decays with the gap, with a floor
            if user_exp < exp_min:
                gap = exp_min - user_exp
                return max(0.3, 1.0 - (gap * 0.2))
            gap = user_exp - exp_max
            return max(0.5, 1.0 - (gap * 0.15))

        except Exception as e:
            self.logger.error(f"Error calculating experience match: {e}")
            return 0.5

    def calculate_education_match(self, job: pd.Series) -> float:
        """
        Calculate education requirement match.

        'education_required' is read as a comma-separated list of degrees
        and compared upper-cased against the user's degrees.

        Returns:
            1.0 exact match, 0.9 equivalent degree (either direction),
            0.8 no requirement stated, 0.3 no match, 0.5 on error
        """
        try:
            job_education_str = str(job.get('education_required', ''))
            if not job_education_str or job_education_str.lower() == 'nan':
                return 0.8  # No requirement = acceptable

            job_degrees = {
                deg.upper().strip()
                for deg in job_education_str.split(',')
                if deg.strip()
            }
            if not job_degrees:
                return 0.8

            user_degrees = set(self.user.education)

            if job_degrees & user_degrees:
                return 1.0  # Perfect match

            # Equivalent when one group contains a degree from each side
            for group in self._EQUIVALENT_DEGREES:
                if group & user_degrees and group & job_degrees:
                    return 0.9

            return 0.3  # No match

        except Exception as e:
            self.logger.error(f"Error calculating education match: {e}")
            return 0.5

    def calculate_location_match(self, job: pd.Series) -> float:
        """
        Calculate location preference match.

        City names are compared title-cased on both sides, so the user's
        lists match regardless of how they were capitalised.

        Returns:
            0.0 deal-breaker city, 1.0 remote job acceptable to the user or
            preferred city, 0.8 acceptable hybrid job, 0.6 user has no
            preferred cities, 0.4 otherwise, 0.5 on error
        """
        try:
            job_location = str(job.get('location_city', '')).strip().title()
            work_mode = str(job.get('work_mode', 'On-site')).strip()

            blocked = {
                str(loc).strip().title()
                for loc in self.user.deal_breaker_locations
                if str(loc).strip()
            }
            preferred = {
                str(loc).strip().title()
                for loc in self.user.preferred_locations
                if str(loc).strip()
            }

            # Deal-breaker check
            if job_location in blocked:
                return 0.0

            # Remote work preference
            if work_mode == 'Remote' and self.user.preferred_work_mode in ['Remote', 'Hybrid', 'Any']:
                return 1.0

            # Check preferred locations
            if job_location in preferred:
                return 1.0

            # Hybrid acceptable
            if work_mode == 'Hybrid' and self.user.preferred_work_mode in ['Hybrid', 'Any']:
                return 0.8

            # No preference specified
            if not preferred:
                return 0.6

            return 0.4  # Not in preferred locations

        except Exception as e:
            self.logger.error(f"Error calculating location match: {e}")
            return 0.5

    def calculate_completeness_score(self, job: pd.Series) -> float:
        """
        Use the job's existing completeness score.

        Returns the 'completeness_score' cell as a float, or 0.5 when the
        cell is missing, NaN, or not convertible. The value is passed
        through unscaled.
        NOTE(review): presumably already on a 0.0-1.0 scale - confirm
        against the producer of this column.
        """
        try:
            score = float(job.get('completeness_score', 0.5))
        except Exception:
            return 0.5
        # NaN would otherwise propagate into the weighted total
        return 0.5 if pd.isna(score) else score

    def calculate_salary_competitiveness(self, job: pd.Series) -> float:
        """
        Calculate how competitive the salary is compared to expectations.

        Uses 'salary_max'; a missing, NaN or zero salary is neutral.

        Returns:
            1.0 at or above the expected maximum, 0.7-1.0 interpolated
            inside the expected range, 0.6 within 20% below the minimum,
            0.4 further below, 0.5 no salary information or on error,
            0.7 when the user set no expected maximum
        """
        try:
            job_salary = self._number_or_zero(job.get('salary_max', 0))

            # No salary information
            if job_salary == 0:
                return 0.5  # Neutral score

            # No expectations set
            if self.user.max_expected_salary == 0:
                return 0.7  # Default positive

            expected_min = self.user.min_expected_salary
            expected_max = self.user.max_expected_salary

            # Above expectations
            if job_salary >= expected_max:
                return 1.0

            # Within range: linear interpolation from 0.7 up to 1.0.
            # The strict upper bound guarantees a non-zero range size here.
            if expected_min <= job_salary < expected_max:
                range_size = expected_max - expected_min
                position = job_salary - expected_min
                return 0.7 + (0.3 * (position / range_size))

            # Below minimum but close
            if job_salary >= expected_min * 0.8:
                return 0.6

            # Below expectations
            return 0.4

        except Exception as e:
            self.logger.error(f"Error calculating salary competitiveness: {e}")
            return 0.5

    def calculate_company_reputation(self, job: pd.Series) -> float:
        """
        Calculate company reputation score.

        Returns:
            1.0 exact preferred-company match, 0.9 case-insensitive partial
            match with a preferred company, 0.8 name contains a well-known
            company, 0.6 unknown company, 0.5 missing name or on error
        """
        try:
            company_name = str(job.get('company_name', '')).strip()

            if not company_name or company_name.lower() == 'nan':
                return 0.5

            # Blank entries are dropped: an empty string is a substring of
            # every name and would turn every company into a partial match.
            preferred_names = [
                name.strip()
                for name in self.user.preferred_companies
                if name and name.strip()
            ]

            # Exact match with preferred companies
            if company_name in preferred_names:
                return 1.0

            # Partial match (case-insensitive)
            company_lower = company_name.lower()
            for preferred in preferred_names:
                preferred_lower = preferred.lower()
                if preferred_lower in company_lower or company_lower in preferred_lower:
                    return 0.9

            if any(top_comp in company_lower for top_comp in self._TOP_COMPANIES):
                return 0.8

            # Default score for unknown companies
            return 0.6

        except Exception as e:
            self.logger.error(f"Error calculating company reputation: {e}")
            return 0.5

    def calculate_deadline_urgency(self, job: pd.Series) -> float:
        """
        Calculate urgency based on application deadline.

        'application_deadline' must be in YYYY-MM-DD form. Days remaining
        are counted in whole calendar days, so a deadline that falls today
        has 0 days remaining and is still open.

        Returns:
            0.0 expired, 1.0 within 7 days, 0.8 within 15 days,
            0.5 within 30 days, 0.3 later, 0.5 when the deadline is
            missing, unparseable, or on error
        """
        try:
            deadline_str = str(job.get('application_deadline', '')).strip()

            if not deadline_str or deadline_str.lower() in ['nan', 'none']:
                return 0.5  # No deadline = medium priority

            try:
                deadline = datetime.strptime(deadline_str, '%Y-%m-%d')
            except ValueError:
                return 0.5

            # Compare dates, not datetimes: the parsed deadline is midnight,
            # which is already in the past for most of the deadline day.
            days_remaining = (deadline.date() - datetime.now().date()).days

            if days_remaining < 0:
                return 0.0  # Expired
            if days_remaining <= 7:
                return 1.0  # Very urgent
            if days_remaining <= 15:
                return 0.8  # High urgency
            if days_remaining <= 30:
                return 0.5  # Medium urgency
            return 0.3  # Low urgency

        except Exception as e:
            self.logger.error(f"Error calculating deadline urgency: {e}")
            return 0.5

    def calculate_posting_freshness(self, job: pd.Series) -> float:
        """
        Calculate how recent the job posting is.

        'extraction_timestamp' is parsed as ISO 8601 (a trailing 'Z' is
        accepted). Timezone-aware values are converted to local time before
        being compared with the local clock; naive values are taken as
        local time already.

        Returns:
            1.0 up to 3 days old, 0.8 up to 7, 0.6 up to 14, 0.4 up to 30,
            0.2 older, 0.6 when the timestamp is missing or unparseable,
            0.5 on error
        """
        try:
            timestamp_str = str(job.get('extraction_timestamp', ''))

            if not timestamp_str or timestamp_str.lower() == 'nan':
                return 0.6  # Unknown = medium score

            try:
                posting_time = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
                if posting_time.tzinfo is not None:
                    # Convert, then drop the offset: just stripping tzinfo
                    # would leave the clock reading of another zone.
                    posting_time = posting_time.astimezone().replace(tzinfo=None)
                days_old = (datetime.now() - posting_time).days
            except (ValueError, OverflowError, OSError):
                return 0.6

            if days_old <= 3:
                return 1.0  # Very fresh
            if days_old <= 7:
                return 0.8  # Fresh
            if days_old <= 14:
                return 0.6  # Recent
            if days_old <= 30:
                return 0.4  # Older
            return 0.2  # Old

        except Exception as e:
            self.logger.error(f"Error calculating posting freshness: {e}")
            return 0.5

    def calculate_preference_bonus(self, job: pd.Series) -> float:
        """
        Calculate bonus score based on multiple preference matches.

        Adds 0.4 for a preferred company, 0.3 for a preferred city
        (compared title-cased, as in ``calculate_location_match``) and 0.3
        when the work mode matches the user's preference or the preference
        is 'Any'. The sum is capped at 1.0; returns 0.0 on error.
        """
        try:
            bonus = 0.0

            # Preferred company bonus
            company_name = str(job.get('company_name', '')).strip()
            if company_name and company_name in self.user.preferred_companies:
                bonus += 0.4

            # Preferred location bonus
            location = str(job.get('location_city', '')).strip().title()
            preferred_locations = {
                str(loc).strip().title() for loc in self.user.preferred_locations
            }
            if location and location in preferred_locations:
                bonus += 0.3

            # Work mode match bonus
            work_mode = str(job.get('work_mode', '')).strip()
            if work_mode == self.user.preferred_work_mode or self.user.preferred_work_mode == 'Any':
                bonus += 0.3

            return min(bonus, 1.0)

        except Exception as e:
            self.logger.error(f"Error calculating preference bonus: {e}")
            return 0.0


# ===================================================================
# SMART PRIORITIZATION ENGINE
# ===================================================================

class SmartPrioritizationEngine:
    """
    Main prioritization engine that orchestrates scoring and ranking.
    Combines all scoring components with configurable weights.
    """
    
    def __init__(
        self,
        user_profile: UserProfile,
        weights: Optional[PrioritizationWeights] = None
    ):
        """
        Set up the scorer and weights for one user and log a start-up banner.

        Args:
            user_profile: Profile passed to JobScorer and summarised in the
                banner (name, id, first five primary skills, experience).
            weights: Scoring weights. When omitted or falsy, the result of
                PrioritizationWeights.get_default() is used instead.
        """
        self.user_profile = user_profile
        self.weights = weights if weights else PrioritizationWeights.get_default()
        self.scorer = JobScorer(user_profile)
        self.logger = logging.getLogger("PrioritizationEngine")

        rule = "=" * 70
        announce = self.logger.info

        announce(rule)
        announce("üéØ SMART PRIORITIZATION ENGINE INITIALIZED")
        announce(rule)
        announce(f"User: {user_profile.name} ({user_profile.user_id})")
        announce(f"Skills: {', '.join(user_profile.primary_skills[:5])}")
        announce(f"Experience: {user_profile.experience_years} years")
        announce(rule + "\n")
    
    def calculate_job_priority(self, job: pd.Series) -> Dict[str, float]:
        """Calculate comprehensive priority score for a job."""
        try:
            # Calculate all scoring components
            scores = {
                'skills_match': self.scorer.calculate_skills_match(job),
                'experience_match': self.scorer.calculate_experience_match(job),
                'education_match': self.scorer.calculate_education_match(job),
                'location_match': self.scorer.calculate_location_match(job),
                'completeness': self.scorer.calculate_completeness_score(job),
                'salary_competitiveness': self.scorer.calculate_salary_competitiveness(job),
                'company_reputation': self.scorer.calculate_company_reputation(job),
                'deadline_urgency': self.scorer.calculate_deadline_urgency(job),
                'posting_freshness': self.scorer.calculate_posting_freshness(job),
                'preference_bonus': self.scorer.calculate_preference_bonus(job)
            }
            
            # Calculate weighted total (0-1 scale)
            final_score = (
                scores['skills_match'] * self.weights.skills_match_weight +
                scores['experience_match'] * self.weights.experience_match_weight +
                scores['education_match'] * self.weights.education_match_weight +
                scores['location_match'] * self.weights.location_match_weight +
                scores['completeness'] * self.weights.completeness_weight +
                scores['salary_competitiveness'] * self.weights.salary_competitiveness_weight +
                scores['company_reputation'] * self.weights.company_reputation_weight +
                scores['deadline_urgency'] * self.weights.deadline_urgency_weight +
                scores['posting_freshness'] * self.weights.posting_freshness_weight +
                scores['preference_bonus'] * self.weights.preference_bonus_weight
            )
            
            # Apply must-have filters (hard constraints)
            if not self._check_must_have_requirements(job):
                final_score *= 0.3  # Severe penalty for missing must-haves
            
            # Scale to 0-100
            scores['final_priority_score'] = final_score * 100
            scores['priority_tier'] = self._get_priority_tier(scores['final_priority_score'])
            
            return scores
            
        except Exception as e:
            self.logger.error(f"Error calculating job priority: {e}")
            return {'final_priority_score': 0.0, 'priority_tier': 'Low'}
    
    def _check_must_have_requirements(self, job: pd.Series) -> bool:
        """Check if job meets must-have requirements."""
        try:
            # Check must-have skills
            if self.user_profile.must_have_skills:
                job_skills_str = str(job.get('skills_required', '')).lower()
                for must_have_skill in self.user_profile.must_have_skills:
                    if must_have_skill.lower() not in job_skills_str:
                        return False
            return True
            
        except Exception as e:
            self.logger.error(f"Error checking must-have requirements: {e}")
            return True
    
    def _get_priority_tier(self, score: float) -> str:
        """Convert numerical score to priority tier."""
        if score >= 80:
            return "üî• Must Apply"
        elif score >= 65:
            return "‚≠ê High Priority"
        elif score >= 50:
            return "‚úì Medium Priority"
        elif score >= 35:
            return "‚óã Low Priority"
        else:
            return "‚àí Not Recommended"
    
    def prioritize_jobs(
        self,
        jobs_df: pd.DataFrame,
        save_output: bool = True,
        output_path: str = "prioritized_jobs.csv"
    ) -> pd.DataFrame:
        """Prioritize all jobs in the dataset."""
        self.logger.info(f"üîÑ Prioritizing {len(jobs_df)} job postings...")
        
        # Calculate priority for each job
        priority_data = []
        
        for idx, job in jobs_df.iterrows():
            try:
                scores = self.calculate_job_priority(job)
                priority_data.append(scores)
                
                # Log progress every 50 jobs
                if (idx + 1) % 50 == 0:
                    self.logger.info(f"   Processed {idx+1}/{len(jobs_df)} jobs")
                    
            except Exception as e:
                self.logger.error(f"Error processing job {idx}: {e}")
                priority_data.append({'final_priority_score': 0.0, 'priority_tier': 'Low'})
        
        # Add priority scores to DataFrame
        priority_df = pd.DataFrame(priority_data)
        result_df = pd.concat([jobs_df.reset_index(drop=True), priority_df], axis=1)
        
        # Sort by priority score
        result_df = result_df.sort_values('final_priority_score', ascending=False)
        result_df = result_df.reset_index(drop=True)
        
        # Generate statistics
        self._generate_priority_statistics(result_df)
        
        # Save output
        if save_output:
            try:
                result_df.to_csv(output_path, index=False)
                self.logger.info(f"üíæ Saved prioritized jobs to: {output_path}\n")
            except Exception as e:
                self.logger.error(f"Error saving output: {e}")
        
        return result_df
    
    def _generate_priority_statistics(self, df: pd.DataFrame):
        """Generate and log priority statistics."""
        self.logger.info(f"\n{'='*70}")
        self.logger.info(f"üìä PRIORITIZATION STATISTICS")
        self.logger.info(f"{'='*70}\n")
        
        # Overall statistics
        self.logger.info(f"üìà OVERALL STATISTICS:")
        self.logger.info(f"   Total Jobs: {len(df)}")
        self.logger.info(f"   Average Priority Score: {df['final_priority_score'].mean():.1f}/100")
        self.logger.info(f"   Median Priority Score: {df['final_priority_score'].median():.1f}/100")
        self.logger.info(f"   Highest Score: {df['final_priority_score'].max():.1f}/100")
        self.logger.info(f"   Lowest Score: {df['final_priority_score'].min():.1f}/100")
        
        # Priority tier distribution
        self.logger.info(f"\nüéØ PRIORITY TIER DISTRIBUTION:")
        tier_counts = df['priority_tier'].value_counts()
        for tier, count in tier_counts.items():
            percentage = (count / len(df)) * 100
            self.logger.info(f"   {tier}: {count} jobs ({percentage:.1f}%)")
        
        # Top scoring components
        self.logger.info(f"\n‚≠ê AVERAGE COMPONENT SCORES:")
        components = [
            ('Skills Match', 'skills_match'),
            ('Experience Match', 'experience_match'),
            ('Education Match', 'education_match'),
            ('Location Match', 'location_match'),
            ('Salary Competitiveness', 'salary_competitiveness'),
            ('Company Reputation', 'company_reputation')
        ]
        
        for name, col in components:
            if col in df.columns:
                avg_score = df[col].mean() * 100
                self.logger.info(f"   {name}: {avg_score:.1f}%")
        
        # Top 10 jobs
        self.logger.info(f"\nüî• TOP 10 RECOMMENDED JOBS:")
        self.logger.info(f"{'‚îÄ'*70}")
        
        for idx, (_, job) in enumerate(df.head(10).iterrows(), 1):
            self.logger.info(
                f"\n#{idx} | Score: {job['final_priority_score']:.1f}/100 | "
                f"{job['priority_tier']}"
            )
            self.logger.info(f"   Position: {job['position_title']}")
            self.logger.info(f"   Company: {job.get('company_name', 'Unknown')}")
            self.logger.info(f"   Location : {job.get('location_city', 'Unknown')} ({job.get('work_mode', '')})")
            skills_preview = str(job.get('skills_required', ''))[:50]
            self.logger.info(f"   Skills: {skills_preview}...")
        
            if job.get('salary_max', 0) > 0:
                salary_lpa = job['salary_max'] / 100000
                self.logger.info(f"   Salary: ‚Çπ{salary_lpa:.1f} LPA")
    
        self.logger.info(f"\n{'='*70}\n")

    def get_recommended_jobs(
        self,
        df: pd.DataFrame,
        min_score: float = 65.0,
        max_results: int = 20
    ) -> pd.DataFrame:
        """Get top recommended jobs above threshold."""
        filtered = df[df['final_priority_score'] >= min_score]
        return filtered.head(max_results)
    
# ===================================================================
# USER CUSTOMIZATION INTERFACE
# ===================================================================
class UserCustomization:
    """
    Allows users to customize prioritization weights and preferences.
    Provides methods for adjusting scoring parameters.
    """
    def __init__(self, engine: SmartPrioritizationEngine):
        """
        Bind the customizer to the engine whose weights it will modify.

        A default PrioritizationWeights instance is stored on
        ``original_weights`` at construction time.
        """
        self.logger = logging.getLogger("UserCustomization")
        self.engine = engine
        self.original_weights = PrioritizationWeights.get_default()

    def adjust_weight(
        self,
        component: str,
        new_weight: float,
        redistribute: bool = True
    ):
        if not hasattr(self.engine.weights, component):
            self.logger.error(f"Unknown component: {component}")
            return
        
        if not (0.0 <= new_weight <= 1.0):
            self.logger.error(f"Weight must be between 0.0 and 1.0")
            return
        
        old_weight = getattr(self.engine.weights, component)
        setattr(self.engine.weights, component, new_weight)
        
        self.logger.info(
            f"‚úì Adjusted {component}: {old_weight:.2f} ‚Üí {new_weight:.2f}"
        )
        
        if redistribute:
            self._redistribute_weights(component)

    def _redistribute_weights(self, changed_component: str):
        weight_attrs = [
            attr for attr in dir(self.engine.weights)
            if attr.endswith('_weight') and not attr.startswith('_')
        ]
        
        total = sum(getattr(self.engine.weights, attr) for attr in weight_attrs)
        
        if not (0.95 <= total <= 1.05):
            other_attrs = [attr for attr in weight_attrs if attr != changed_component]
            other_total = sum(getattr(self.engine.weights, attr) for attr in other_attrs)
            
            if other_total > 0:
                scale_factor = (1.0 - getattr(self.engine.weights, changed_component)) / other_total
                for attr in other_attrs:
                    current = getattr(self.engine.weights, attr)
                    setattr(self.engine.weights, attr, current * scale_factor)

    def boost_component(self, component: str, multiplier: float = 1.5):
        if not hasattr(self.engine.weights, component):
            self.logger.error(f"Unknown component: {component}")
            return
        
        current = getattr(self.engine.weights, component)
        new_weight = min(current * multiplier, 0.5)  # Cap at 0.5
        self.adjust_weight(component, new_weight)

    def penalize_component(self, component: str, multiplier: float = 0.5):
        if not hasattr(self.engine.weights, component):
            self.logger.error(f"Unknown component: {component}")
            return
        
        current = getattr(self.engine.weights, component)
        new_weight = max(current * multiplier, 0.01)  # Min at 0.01
        self.adjust_weight(component, new_weight)

    def reset_weights(self):
        """Replace the engine's weights with a fresh default set."""
        defaults = PrioritizationWeights.get_default()
        self.engine.weights = defaults
        self.logger.info("‚úì Reset all weights to default values")

    def save_preferences(self, filepath: str = "user_preferences.json"):
        try:
            preferences = {
                'weights': self.engine.weights.to_dict(),
                'user_profile': asdict(self.engine.user_profile),
                'saved_at': datetime.now().isoformat()
            }
            
            with open(filepath, 'w') as f:
                json.dump(preferences, f, indent=2)
            
            self.logger.info(f"üíæ Saved preferences to: {filepath}")
            
        except Exception as e:
                self.logger.error(f"Error saving preferences: {e}")

    def load_preferences(self, filepath: str = "user_preferences.json"):
        try:
            with open(filepath, 'r') as f:
                preferences = json.load(f)
            
            weights_dict = preferences.get('weights', {})
            self.engine.weights = PrioritizationWeights(**weights_dict)
            
            self.logger.info(f"‚úì Loaded preferences from: {filepath}")
            
        except Exception as e:
            self.logger.error(f"Error loading preferences: {e}")

# ===================================================================
# MAIN EXECUTION PIPELINE
# ===================================================================
def main(
    jobs_csv: str = "structured_job_postings.csv",
    user_profile: Optional[UserProfile] = None,
    custom_weights: Optional[PrioritizationWeights] = None
    ) -> Tuple[pd.DataFrame, SmartPrioritizationEngine]:
    """
    Run the whole prioritization pipeline on a CSV of job postings.

    Loads the CSV, builds the engine, ranks every posting, then extracts
    the recommendations (score >= 65, at most 20). The ranked table is
    written to prioritized_jobs.csv and the recommendations to
    top_recommendations.csv, both relative to the working directory.

    Args:
        jobs_csv: Path of the structured postings CSV.
        user_profile: Profile to score against; the sample profile from
            UserProfile.create_sample_profile() is used when omitted.
        custom_weights: Weights forwarded to the engine.

    Returns:
        The ranked DataFrame and the engine that produced it.

    Raises:
        FileNotFoundError: If ``jobs_csv`` does not exist.
        Exception: Any other pipeline failure is logged and re-raised.
    """
    rule = "=" * 70

    logger.info("\n" + rule)
    logger.info("üöÄ STARTING JOB PRIORITIZATION PIPELINE")
    logger.info(rule + "\n")

    try:
        # Step 1: load the postings, failing early on a bad path.
        logger.info(f"üìÇ Loading jobs from: {jobs_csv}")
        if not os.path.exists(jobs_csv):
            raise FileNotFoundError(f"File not found at: {jobs_csv}")

        jobs_df = pd.read_csv(jobs_csv)
        logger.info(f"‚úÖ Loaded {len(jobs_df)} job postings\n")

        # Step 2: fall back to the sample profile when none was supplied.
        if user_profile is None:
            logger.info("üë§ Creating sample user profile...")
            user_profile = UserProfile.create_sample_profile()

        # Step 3: build the engine and rank every posting.
        engine = SmartPrioritizationEngine(
            user_profile=user_profile,
            weights=custom_weights
        )
        prioritized_df = engine.prioritize_jobs(
            jobs_df=jobs_df,
            save_output=True,
            output_path="prioritized_jobs.csv"
        )

        # Step 4: keep the high-priority subset and store it separately.
        logger.info("üéØ Generating recommendations...")
        recommendations = engine.get_recommended_jobs(
            df=prioritized_df,
            min_score=65.0,
            max_results=20
        )
        logger.info(f"‚úÖ Found {len(recommendations)} high-priority recommendations\n")

        recommendations.to_csv("top_recommendations.csv", index=False)
        logger.info("üíæ Saved top recommendations to: top_recommendations.csv\n")

        # Step 5: closing summary.
        closing_lines = (
            rule,
            "‚úÖ PRIORITIZATION PIPELINE COMPLETED",
            rule,
            "üìÅ Output Files:",
            "   - prioritized_jobs.csv (all jobs with scores)",
            "   - top_recommendations.csv (high-priority jobs)",
            "   - job_prioritization.log (execution log)",
            rule + "\n",
        )
        for line in closing_lines:
            logger.info(line)

        return prioritized_df, engine

    except Exception as e:
        logger.error(f"‚ùå PIPELINE FAILED: {e}")
        raise

# ===================================================================
# ENTRY POINT
# ===================================================================
if __name__ == "__main__":
    # Intentionally a no-op: in this notebook the pipeline is launched from
    # the following cell rather than from this guard.
    # To run with the default settings instead, call:
    #     prioritized_df, engine = main()
    pass
    # Actual execution moved to cell 2 as per user notebook structure

In [None]:
# Create user profile
# (the built-in sample profile; pass a different UserProfile to score
# against another user)
user = UserProfile.create_sample_profile()

# Absolute, machine-specific location of the Phase 3 output CSV.
# Edit this before running the notebook on another machine.
path = r"D:\Projects By Month\November 2025\Placement Mail Analysis System\.venv\Phase_scripts\Phase 3\structured_job_postings.csv"

# Run prioritization
# main() returns the ranked DataFrame and the engine; it also writes
# prioritized_jobs.csv and top_recommendations.csv to the current working
# directory.
prioritized_df, engine = main(
    jobs_csv=path,
    user_profile=user
)