In [12]:
import os 
from dotenv import load_dotenv
import pdfplumber
import docx2txt

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser


In [3]:
# Load .env for API key
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

In [4]:
if not api_key:
    raise ValueError("❌ GOOGLE_API_KEY not found in .env file")

In [42]:
# Initialize Gemini 2.5 Flash LLM with safe optimizations (maintains consistency)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,  # Restore original temperature for consistency
    api_key=api_key,
    max_output_tokens=4096,  # Set explicit limit for faster response
    max_retries=2,  # Keep reasonable retries for reliability
)

E0000 00:00:1759853245.293862    9971 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [51]:

def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract text from PDF or DOCX resume."""
    if file_type.lower() == "pdf":
        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            max_pages = min(2, len(pdf.pages))
            for i in range(max_pages):
                text_parts.append(pdf.pages[i].extract_text() or "")
        text = "\n".join(text_parts)
    elif file_type.lower() == "docx":
        text = docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Optional cleaning
    text = text.replace('\n', ' ').replace('\t', ' ').strip()
    return text


In [52]:
# Combined Unified Resume Extraction with JSON Output (Fixed Parser Issue)
import json

# Create a LangChain prompt with direct JSON output
prompt = ChatPromptTemplate.from_template("""
You are an expert Resume Intelligence Agent that extracts structured data and evaluates resumes for ATS compatibility.

Analyze the following resume text and return ONLY a valid JSON object with these exact keys:

{{
  "name": "",
  "location": "",
  "summary": "",
  "skills": [],
  "extra_skills": [],
  "work_experience": [],
  "projects": [],
  "certifications": [],
  "education": [],
  "ats_feedback": {{
    "score": 0,
    "summary": "",
    "strengths": [],
    "improvements": []
  }}
}}

CRITICAL EXTRACTION RULES FOR ALL SECTIONS:

1. **NAME**: Extract the full name exactly as written, using the most prominent name (usually at the top).

2. **LOCATION**: Extract specific city and country from contact info, address, or personal details. Format as "City, Country" (e.g., "Chennai, India", "Bangalore, India"). If no city is specified, use " Country".

3. **SUMMARY**: Look for sections titled "Summary", "Objective", "Profile", "About Me", "Career Summary". Extract complete professional summary.

4. **SKILLS**: Extract skills ONLY from dedicated "Skills", "Technical Skills", "Core Skills", "Programming Languages", or similar sections:
   - ONLY include skills explicitly listed in a dedicated skills section
   - Programming languages, frameworks, tools, technologies mentioned in skills section
   - Return as array of individual skills from the skills section only

5. **EXTRA_SKILLS**: Extract additional skills mentioned in other contexts:
   - Skills mentioned in work experience descriptions
   - Technologies used in projects
   - Skills mentioned in certifications or education
   - Any other skills not in the main skills section
   - Return as array of individual skills from non-skills sections

6. **WORK_EXPERIENCE**: Extract each position with:
   - Job title, company, duration, location
   - Key responsibilities and achievements
   - Format as structured objects with consistent fields

7. **PROJECTS**: Extract personal/academic projects with:
   - Project name, duration, technologies used
   - Brief description and key features
   - Any notable achievements or results

8. **CERTIFICATIONS**: Extract all certifications with:
   - Certification name, issuing organization, year
   - Include online courses, professional certifications

9. **EDUCATION**: Extract educational background with:
   - Degree, institution, graduation year
   - Relevant coursework or achievements

10. **ATS_FEEDBACK**: Provide objective evaluation:
    - score: 0-100 based on ATS compatibility
    - summary: Brief assessment
    - strengths: Positive aspects
    - improvements: Areas for enhancement 

Guidelines:
- Detect section names dynamically (e.g., "Profile", "About Me", "Objective" → summary).
- CRITICAL: Skills extraction must be source-aware:
  * "skills" array: ONLY from dedicated skills sections (Skills, Technical Skills, Core Skills, Programming Languages, etc.)
  * "extra_skills" array: Skills mentioned in work experience, projects, certifications, education, or other contexts
- Extract job/project details separately.
- Be consistent and produce clean JSON only.
- Prioritize accuracy over completeness.
- IMPORTANT: Return ONLY the JSON object, no additional text or explanations.

Resume Text:
{resume_text}
""")

# Build the chain with StrOutputParser for better JSON handling
resume_parser_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    output_parser=StrOutputParser()
)

In [55]:
# Step 1: Extract text
resume_path = "app/resumes/SaidivyaResume.pdf"
text = extract_text_from_file(resume_path, "pdf")

# Step 2: Parse with Gemini
try:
    raw_output = resume_parser_chain.run({"resume_text": text})
    # print("Raw LLM Output:")
    # print(raw_output)
    # print("\n" + "="*50 + "\n")
    
    # Parse JSON from the output
    structured_output = json.loads(raw_output)
    print("Parsed JSON Output:")
    print(json.dumps(structured_output, indent=2))
    
except json.JSONDecodeError as e:
    print(f"JSON Parsing Error: {e}")
    print("Raw output that failed to parse:")
    print(raw_output)
except Exception as e:
    print(f"Error: {e}")
    print("Raw output:")
    print(raw_output)


Cannot set gray non-stroke color because /'P7' is an invalid float value


JSON Parsing Error: Expecting value: line 1 column 1 (char 0)
Raw output that failed to parse:
```json
{
  "name": "KODIPAKA SAIDIVYA",
  "location": "India",
  "summary": "I am a dedicated student and avid continuous learner with a strong proficiency in electrical and electronics engineering. I have gained valuable hands-on experience during a 6-month internship at a TS Transco substation. Currently, I am expanding my expertise in data science, SQL, and Python to keep pace with emerging technological advancements.",
  "skills": [
    "Python",
    "C",
    "SQL",
    "Data Structures",
    "Data Science",
    "MS Excel",
    "AutoCAD",
    "CMOS VLSI Design",
    "Circuit Designing",
    "Electronics Enthusiast",
    "Electrical Proficiency",
    "Quick Learning",
    "problem solving",
    "Adaptability",
    "Leadership qualities"
  ],
  "extra_skills": [
    "MATLAB",
    "ML",
    "Simulink",
    "advanced grid technologies",
    "smart grid systems",
    "renewable energy sources

In [None]:
# LinkedIn API Integration for Agentic Job Scraping and Matching
import requests
import json
from typing import List, Dict, Optional
from datetime import datetime, timedelta

class LinkedInJobAgent:
    """
    Agentic LinkedIn Job Scraping and Matching Agent
    Uses LinkedIn Marketing API for job data and AI for intelligent matching
    """
    
    def __init__(self, access_token: str, api_version: str = "202312"):
        self.access_token = access_token
        self.api_version = api_version
        self.base_url = f"https://api.linkedin.com/v{api_version}"
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
            "X-Restli-Protocol-Version": "2.0.0"
        }
    
    def search_jobs_agentically(self, 
                              keywords: List[str],
                              location: str,
                              experience_level: str,
                              job_type: str,
                              max_results: int = 50) -> List[Dict]:
        """
        Agentically search for jobs using LinkedIn API with intelligent filtering
        """
        try:
            # Build intelligent search query
            search_params = self._build_intelligent_search_params(
                keywords, location, experience_level, job_type
            )
            
            # Make API request
            response = requests.get(
                f"{self.base_url}/jobSearch",
                headers=self.headers,
                params=search_params
            )
            
            if response.status_code == 200:
                jobs_data = response.json()
                return self._process_job_results(jobs_data, max_results)
            else:
                print(f"API Error: {response.status_code} - {response.text}")
                return []
                
        except Exception as e:
            print(f"Error in job search: {e}")
            return []
    
    def _build_intelligent_search_params(self, 
                                       keywords: List[str],
                                       location: str,
                                       experience_level: str,
                                       job_type: str) -> Dict:
        """
        Build intelligent search parameters based on job requirements
        """
        # Convert keywords to LinkedIn search format
        keyword_query = " OR ".join([f'"{kw}"' for kw in keywords])
        
        # Map experience levels to LinkedIn codes
        experience_mapping = {
            "entry": "1",
            "associate": "2", 
            "mid": "3",
            "senior": "4",
            "executive": "5"
        }
        
        # Map job types to LinkedIn codes
        job_type_mapping = {
            "full-time": "F",
            "part-time": "P",
            "contract": "C",
            "internship": "I"
        }
        
        return {
            "keywords": keyword_query,
            "locationName": location,
            "experienceLevel": experience_mapping.get(experience_level, "3"),
            "jobType": job_type_mapping.get(job_type, "F"),
            "sortBy": "DD",  # Date descending
            "count": 50
        }
    
    def _process_job_results(self, jobs_data: Dict, max_results: int) -> List[Dict]:
        """
        Process and enrich job results with additional data
        """
        processed_jobs = []
        
        for job in jobs_data.get("elements", [])[:max_results]:
            # Extract basic job info
            job_info = {
                "id": job.get("id"),
                "title": job.get("title"),
                "company": job.get("companyName"),
                "location": job.get("location"),
                "description": job.get("description"),
                "posted_date": job.get("postedDate"),
                "job_type": job.get("jobType"),
                "experience_level": job.get("experienceLevel"),
                "salary": job.get("salaryInfo"),
                "remote_allowed": job.get("remoteAllowed"),
                "skills": job.get("skills", []),
                "linkedin_url": f"https://linkedin.com/jobs/view/{job.get('id')}"
            }
            
            # Enrich with company data
            if job.get("companyId"):
                company_data = self._get_company_info(job["companyId"])
                job_info.update(company_data)
            
            processed_jobs.append(job_info)
        
        return processed_jobs
    
    def _get_company_info(self, company_id: str) -> Dict:
        """
        Get additional company information
        """
        try:
            response = requests.get(
                f"{self.base_url}/companies/{company_id}",
                headers=self.headers
            )
            
            if response.status_code == 200:
                company_data = response.json()
                return {
                    "company_size": company_data.get("companySize"),
                    "industry": company_data.get("industry"),
                    "company_description": company_data.get("description"),
                    "company_website": company_data.get("websiteUrl")
                }
        except Exception as e:
            print(f"Error getting company info: {e}")
        
        return {}
    
    def match_jobs_with_resume(self, 
                             jobs: List[Dict], 
                             resume_data: Dict) -> List[Dict]:
        """
        Agentically match jobs with resume using AI-powered analysis
        """
        matched_jobs = []
        
        for job in jobs:
            # Calculate match score using multiple factors
            match_score = self._calculate_match_score(job, resume_data)
            
            if match_score > 0.3:  # Minimum threshold
                job["match_score"] = match_score
                job["match_reasons"] = self._get_match_reasons(job, resume_data)
                matched_jobs.append(job)
        
        # Sort by match score
        return sorted(matched_jobs, key=lambda x: x["match_score"], reverse=True)
    
    def _calculate_match_score(self, job: Dict, resume_data: Dict) -> float:
        """
        Calculate match score between job and resume
        """
        score = 0.0
        
        # Skills match (40% weight)
        job_skills = set([skill.lower() for skill in job.get("skills", [])])
        resume_skills = set([skill.lower() for skill in resume_data.get("skills", [])])
        extra_skills = set([skill.lower() for skill in resume_data.get("extra_skills", [])])
        all_resume_skills = resume_skills.union(extra_skills)
        
        if job_skills and all_resume_skills:
            skills_match = len(job_skills.intersection(all_resume_skills)) / len(job_skills)
            score += skills_match * 0.4
        
        # Experience level match (20% weight)
        if job.get("experience_level") and resume_data.get("work_experience"):
            exp_match = self._match_experience_level(job["experience_level"], resume_data["work_experience"])
            score += exp_match * 0.2
        
        # Location match (15% weight)
        if job.get("location") and resume_data.get("location"):
            location_match = self._match_location(job["location"], resume_data["location"])
            score += location_match * 0.15
        
        # Job type match (10% weight)
        if job.get("job_type"):
            job_type_match = self._match_job_type(job["job_type"], resume_data)
            score += job_type_match * 0.1
        
        # Education match (15% weight)
        if resume_data.get("education"):
            education_match = self._match_education(job, resume_data["education"])
            score += education_match * 0.15
        
        return min(score, 1.0)  # Cap at 1.0
    
    def _match_experience_level(self, job_level: str, work_experience: List[Dict]) -> float:
        """
        Match experience level between job and resume
        """
        # Count years of experience
        total_years = 0
        for exp in work_experience:
            if exp.get("duration"):
                # Parse duration and calculate years (simplified)
                total_years += self._parse_duration_to_years(exp["duration"])
        
        # Map job levels to expected years
        level_mapping = {
            "entry": (0, 2),
            "associate": (1, 4),
            "mid": (3, 7),
            "senior": (6, 12),
            "executive": (10, 20)
        }
        
        expected_range = level_mapping.get(job_level, (0, 5))
        if expected_range[0] <= total_years <= expected_range[1]:
            return 1.0
        elif total_years < expected_range[0]:
            return 0.5  # Underqualified but might be acceptable
        else:
            return 0.7  # Overqualified but still relevant
        
        return 0.0
    
    def _match_location(self, job_location: str, resume_location: str) -> float:
        """
        Match location compatibility
        """
        if not job_location or not resume_location:
            return 0.5
        
        job_location = job_location.lower()
        resume_location = resume_location.lower()
        
        # Exact match
        if job_location == resume_location:
            return 1.0
        
        # Same city
        job_city = job_location.split(",")[0].strip()
        resume_city = resume_location.split(",")[0].strip()
        if job_city == resume_city:
            return 0.9
        
        # Same country
        job_country = job_location.split(",")[-1].strip()
        resume_country = resume_location.split(",")[-1].strip()
        if job_country == resume_country:
            return 0.6
        
        # Remote work
        if "remote" in job_location:
            return 0.8
        
        return 0.2
    
    def _match_job_type(self, job_type: str, resume_data: Dict) -> float:
        """
        Match job type preferences
        """
        # This could be enhanced with user preferences
        # For now, return neutral score
        return 0.5
    
    def _match_education(self, job: Dict, education: List[Dict]) -> float:
        """
        Match education requirements
        """
        # This could be enhanced with specific degree requirements
        # For now, return neutral score if education exists
        return 0.7 if education else 0.3
    
    def _parse_duration_to_years(self, duration: str) -> float:
        """
        Parse duration string to years (simplified)
        """
        # This is a simplified parser - could be enhanced
        if "year" in duration.lower():
            return 1.0
        elif "month" in duration.lower():
            return 0.1
        return 0.5
    
    def _get_match_reasons(self, job: Dict, resume_data: Dict) -> List[str]:
        """
        Get reasons why job matches resume
        """
        reasons = []
        
        # Skills match reasons
        job_skills = set([skill.lower() for skill in job.get("skills", [])])
        resume_skills = set([skill.lower() for skill in resume_data.get("skills", [])])
        extra_skills = set([skill.lower() for skill in resume_data.get("extra_skills", [])])
        all_resume_skills = resume_skills.union(extra_skills)
        
        matching_skills = job_skills.intersection(all_resume_skills)
        if matching_skills:
            reasons.append(f"Skills match: {', '.join(list(matching_skills)[:3])}")
        
        # Experience match
        if job.get("experience_level"):
            reasons.append(f"Experience level suitable: {job['experience_level']}")
        
        # Location match
        if job.get("location"):
            reasons.append(f"Location compatible: {job['location']}")
        
        return reasons

# Example usage
def main():
    # Initialize the agent
    access_token = "YOUR_LINKEDIN_ACCESS_TOKEN"
    job_agent = LinkedInJobAgent(access_token)
    
    # Search for jobs
    jobs = job_agent.search_jobs_agentically(
        keywords=["python", "machine learning", "data science"],
        location="Bangalore, India",
        experience_level="mid",
        job_type="full-time",
        max_results=20
    )
    
    # Load resume data (from your resume parser)
    resume_data = {
        "name": "Yeswanth Yerra",
        "location": "Chennai, India",
        "skills": ["Python", "Java", "JavaScript", "Spring Boot", "React.js"],
        "extra_skills": ["Machine Learning", "Docker", "AWS", "MongoDB"],
        "work_experience": [
            {
                "job_title": "Full Stack Development Intern",
                "company": "Pantech Prolabs Pvt Ltd",
                "duration": "June 2024 - October 2024"
            }
        ],
        "education": [
            {
                "degree": "Computer Science and Engineering",
                "institution": "Some University"
            }
        ]
    }
    
    # Match jobs with resume
    matched_jobs = job_agent.match_jobs_with_resume(jobs, resume_data)
    
    # Display results
    print(f"Found {len(jobs)} jobs, matched {len(matched_jobs)} with resume")
    for job in matched_jobs[:5]:  # Top 5 matches
        print(f"\n🎯 {job['title']} at {job['company']}")
        print(f"   Match Score: {job['match_score']:.2f}")
        print(f"   Location: {job['location']}")
        print(f"   Reasons: {'; '.join(job['match_reasons'])}")
        print(f"   URL: {job['linkedin_url']}")

if __name__ == "__main__":
    main()


In [None]:
# Integration Example: LinkedIn Job Agent + Resume Parser
def integrate_job_matching_with_resume_parser():
    """
    Example of how to integrate LinkedIn Job Agent with your resume parser
    """
    
    # Step 1: Parse resume (using your existing parser)
    print("📄 Parsing resume...")
    resume_path = "app/resumes/Yeswanth_Yerra_CV.pdf"
    text = extract_text_from_file(resume_path, "pdf")
    
    # Parse with your existing chain
    raw_output = resume_parser_chain.run({"resume_text": text})
    resume_data = json.loads(raw_output)
    
    print(f"✅ Resume parsed for: {resume_data.get('name')}")
    print(f"📍 Location: {resume_data.get('location')}")
    print(f"🛠️  Skills: {len(resume_data.get('skills', []))} main skills")
    print(f"➕ Extra Skills: {len(resume_data.get('extra_skills', []))} additional skills")
    
    # Step 2: Initialize LinkedIn Job Agent
    print("\n🔍 Initializing LinkedIn Job Agent...")
    linkedin_token = "YOUR_LINKEDIN_ACCESS_TOKEN"  # You'll need to get this
    job_agent = LinkedInJobAgent(linkedin_token)
    
    # Step 3: Extract search parameters from resume
    search_keywords = []
    search_keywords.extend(resume_data.get("skills", [])[:5])  # Top 5 main skills
    search_keywords.extend(resume_data.get("extra_skills", [])[:3])  # Top 3 extra skills
    
    # Remove duplicates and convert to lowercase
    search_keywords = list(set([kw.lower() for kw in search_keywords]))[:8]
    
    print(f"🎯 Search keywords: {search_keywords}")
    
    # Step 4: Search for jobs
    print("\n🔍 Searching for jobs...")
    jobs = job_agent.search_jobs_agentically(
        keywords=search_keywords,
        location=resume_data.get("location", "India"),
        experience_level="mid",  # Could be determined from resume
        job_type="full-time",
        max_results=30
    )
    
    print(f"📊 Found {len(jobs)} jobs")
    
    # Step 5: Match jobs with resume
    print("\n🎯 Matching jobs with resume...")
    matched_jobs = job_agent.match_jobs_with_resume(jobs, resume_data)
    
    print(f"✅ {len(matched_jobs)} jobs matched with resume")
    
    # Step 6: Display top matches
    print("\n🏆 TOP JOB MATCHES:")
    print("=" * 60)
    
    for i, job in enumerate(matched_jobs[:5], 1):
        print(f"\n{i}. 🎯 {job['title']} at {job['company']}")
        print(f"   📊 Match Score: {job['match_score']:.2f}/1.0")
        print(f"   📍 Location: {job['location']}")
        print(f"   💼 Type: {job['job_type']}")
        print(f"   🏢 Industry: {job.get('industry', 'N/A')}")
        print(f"   🔗 URL: {job['linkedin_url']}")
        
        print(f"   ✅ Match Reasons:")
        for reason in job['match_reasons']:
            print(f"      • {reason}")
        
        if job.get('salary'):
            print(f"   💰 Salary: {job['salary']}")
    
    return matched_jobs

# LinkedIn API Setup Instructions
def setup_linkedin_api():
    """
    Instructions for setting up LinkedIn API access
    """
    print("""
🔧 LINKEDIN API SETUP INSTRUCTIONS:

1. Create LinkedIn App:
   - Go to https://www.linkedin.com/developers/apps
   - Click "Create app"
   - Fill in app details

2. Request API Access:
   - Go to "Products" tab
   - Request access to "Marketing Developer Platform"
   - Request access to "Talent Solutions" (for job data)

3. Get Access Token:
   - Use OAuth 2.0 flow to get user authorization
   - Exchange authorization code for access token
   - Store token securely

4. API Endpoints Available:
   - /jobSearch - Search for jobs
   - /companies/{id} - Get company details
   - /people/{id} - Get people profiles (if needed)

5. Rate Limits:
   - Marketing API: 500 requests per app per day
   - Talent Solutions: Varies by plan

6. Alternative Approaches:
   - LinkedIn Learning API (for skills)
   - Partner APIs (like Indeed, Glassdoor)
   - Web scraping with proper rate limiting
   
📝 Note: LinkedIn API access requires approval and may have restrictions.
Consider alternative job data sources if LinkedIn API is not available.
""")

# Run the integration example
if __name__ == "__main__":
    print("🚀 LinkedIn Job Matching Integration Demo")
    print("=" * 50)
    
    # Show setup instructions
    setup_linkedin_api()
    
    # Note: Uncomment below to run actual integration
    # (requires valid LinkedIn API token)
    # matched_jobs = integrate_job_matching_with_resume_parser()


In [59]:
# ==============================================================
# 📘 Job Search Agent (LinkedIn/Naukri Scraper + Skill Matching)
# ==============================================================
# Requirements:
# pip install langchain langchain-google-genai sentence-transformers
# pip install requests beautifulsoup4 python-dotenv
# ==============================================================

import os
import re
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from typing import List, Dict

from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv

# --------------------------------------------------------------
# ✅ Load API Keys and Environment Variables
# --------------------------------------------------------------
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# --------------------------------------------------------------
# ⚙️ Embedding Model Setup
# --------------------------------------------------------------
# You can replace this with Gemini embedding endpoint later.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_text_batch(texts: List[str]):
    return embedding_model.encode(texts, convert_to_tensor=True, show_progress_bar=False)

def embed_text(text: str):
    return embedding_model.encode([text], convert_to_tensor=True, show_progress_bar=False)

def cosine_similarity_batch(a, b):
    return util.cos_sim(a, b)

# --------------------------------------------------------------
# 🧹 Utility: Check Recency
# --------------------------------------------------------------
RECENT_DAYS = 30

def is_recent_post(date_str: str) -> bool:
    """
    Checks if a job post date (like '3 days ago' or 'Posted on 05 Aug 2025')
    is within RECENT_DAYS.
    """
    try:
        if not date_str:
            return True
        if "ago" in date_str.lower():
            m = re.search(r"(\d+)\s+days?\s+ago", date_str)
            if m:
                num = int(m.group(1))
                return num <= RECENT_DAYS
        # Parse absolute date format like '05 Aug 2025'
        dt = datetime.strptime(date_str, "%d %b %Y")
        return (datetime.now() - dt).days <= RECENT_DAYS
    except:
        return True

# --------------------------------------------------------------
# 🌐 Fetch Job Links (Google site: search)
# --------------------------------------------------------------
def google_job_links_recent(domain: str, keyword: str, num: int = 5) -> List[str]:
    """
    Fetches recent job links using Google site search.
    Example: site:linkedin.com/jobs Backend Engineer Bangalore
    """
    search_url = f"https://www.google.com/search?q=site:{domain}%20{keyword}"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(search_url, headers=headers, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")

    links = []
    for a in soup.select("a"):
        href = a.get("href")
        if href and domain in href:
            m = re.match(r"/url\?q=(https?://[^&]+)", href)
            if m:
                real = m.group(1)
                links.append(real)
            else:
                links.append(href)

    # Deduplicate and limit
    unique = []
    for l in links:
        if l not in unique:
            unique.append(l)
        if len(unique) >= num:
            break
    return unique

# --------------------------------------------------------------
# 🕵️ LinkedIn Scraper
# --------------------------------------------------------------
def scrape_linkedin_job(url: str) -> Dict:
    """
    Scrapes LinkedIn job page to extract job details.
    Note: Works best with accessible job URLs (public pages).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")

    title = soup.find("h1", {"class": re.compile(r"topcard__title", re.IGNORECASE)})
    title = title.get_text().strip() if title else ""

    comp = soup.find("a", {"class": re.compile(r"topcard__org-name-link", re.IGNORECASE)})
    company = comp.get_text().strip() if comp else ""

    loc = soup.find("span", {"class": re.compile(r"topcard__flavor--bullet", re.IGNORECASE)})
    location = loc.get_text().strip() if loc else ""

    date_el = soup.find("span", string=re.compile(r"ago|Posted", re.IGNORECASE))
    posted = date_el.get_text().strip() if date_el else ""

    desc_div = soup.find("div", {"class": re.compile(r"show-more-less-html__markup", re.IGNORECASE)})
    description = desc_div.get_text().strip() if desc_div else ""

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "posted": posted,
        "description": description,
    }

# --------------------------------------------------------------
# 🧠 Main Agent Function
# --------------------------------------------------------------
def agent_fetch_and_match(keyword: str, user_profile: str, portal="linkedin.com/jobs", num_jobs=5):
    """
    1️⃣ Searches for recent jobs from LinkedIn/Naukri (via Google site search)
    2️⃣ Scrapes job details
    3️⃣ Filters recent jobs
    4️⃣ Matches with user profile using embeddings
    5️⃣ Returns ranked jobs
    """
    print(f"\n🔍 Searching for recent '{keyword}' jobs from {portal} ...")

    links = google_job_links_recent(portal, keyword, num=num_jobs)
    jobs = []

    for link in links:
        try:
            job = scrape_linkedin_job(link)
            if is_recent_post(job["posted"]):
                jobs.append(job)
            time.sleep(1)  # avoid bot detection
        except Exception as e:
            print("⚠️ Error scraping:", link, "|", e)

    if not jobs:
        print("No jobs found or accessible. Try another keyword.")
        return []

    # Compute embeddings
    texts = [f"{j['title']} {j['description']} {j['location']}" for j in jobs]
    job_vecs = embed_text_batch(texts)
    user_vec = embed_text(user_profile)

    # Compute similarity
    sims = cosine_similarity_batch(user_vec, job_vecs)[0]
    for i, j in enumerate(jobs):
        j["match_score"] = float(sims[i])

    jobs_sorted = sorted(jobs, key=lambda x: x["match_score"], reverse=True)

    print(f"✅ Found {len(jobs_sorted)} relevant jobs.")
    return jobs_sorted

# --------------------------------------------------------------
# 🧪 Example Test Run
# --------------------------------------------------------------
user_profile = "Python, FastAPI, Backend, AWS, SQL, Docker, REST APIs, Microservices"
results = agent_fetch_and_match("Backend Engineer Bangalore", user_profile, portal="linkedin.com/jobs", num_jobs=5)

for i, job in enumerate(results[:5], 1):
    print(f"\n{i}. {job['title']} - {job['company']}")
    print(f"📍 {job['location']} | 🕓 {job['posted']}")
    print(f"💡 Match Score: {job['match_score']:.2f}")
    print(f"🔗 {job['url']}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


🔍 Searching for recent 'Backend Engineer Bangalore' jobs from linkedin.com/jobs ...
⚠️ Error scraping: /search?q=site:linkedin.com/jobs+Backend+Engineer+Bangalore&sca_esv=a1c6a911e61942c7&emsg=SG_REL&sei=yfjkaPqfBd6f0PEPgI28sAk | No connection adapters were found for '/search?q=site:linkedin.com/jobs+Backend+Engineer+Bangalore&sca_esv=a1c6a911e61942c7&emsg=SG_REL&sei=yfjkaPqfBd6f0PEPgI28sAk'
No jobs found or accessible. Try another keyword.
