In [31]:
import os 
from dotenv import load_dotenv
import pdfplumber
import docx2txt

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser


In [32]:
# Load .env for API key
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")


In [33]:
if not api_key:
    raise ValueError("❌ GOOGLE_API_KEY not found in .env file")


In [34]:
# Initialize Gemini 2.5 Flash LLM with safe optimizations (maintains consistency)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,  # Restore original temperature for consistency
    api_key=api_key,
    max_output_tokens=4096,  # Set explicit limit for faster response
    max_retries=2,  # Keep reasonable retries for reliability
)


E0000 00:00:1759938676.582822   13497 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [35]:
def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract text from PDF or DOCX resume."""
    if file_type.lower() == "pdf":
        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            max_pages = min(2, len(pdf.pages))
            for i in range(max_pages):
                text_parts.append(pdf.pages[i].extract_text() or "")
        text = "\n".join(text_parts)
    elif file_type.lower() == "docx":
        text = docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Optional cleaning
    text = text.replace('\n', ' ').replace('\t', ' ').strip()
    return text


In [36]:
# Combined Unified Resume Extraction with JSON Output (Fixed Parser Issue)
import json

# Create a LangChain prompt with direct JSON output
prompt = ChatPromptTemplate.from_template("""
You are an expert Resume Intelligence Agent that extracts structured data and evaluates resumes for ATS compatibility.

Analyze the following resume text and return ONLY a valid JSON object with these exact keys:

{{
  "name": "",
  "location": "",
  "summary": "",
  "skills": [],
  "extra_skills": [],
  "work_experience": [],
  "projects": [],
  "certifications": [],
  "education": [],
  "experience_level": "",
  "recommended_roles": [],
  "ats_feedback": {{
    "score": 0,
    "summary": "",
    "strengths": [],
    "improvements": []
  }}
}}

CRITICAL EXTRACTION RULES FOR ALL SECTIONS:

1. **NAME**: Extract the full name exactly as written, using the most prominent name (usually at the top).

2. **LOCATION**: Extract specific city and country from contact info, address, or personal details. Format as "City, Country" (e.g., "Chennai, India", "Bangalore, India"). If no city is specified, use " Country".

3. **SUMMARY**: Look for sections titled "Summary", "Objective", "Profile", "About Me", "Career Summary". Extract complete professional summary.

4. **SKILLS**: Extract skills ONLY from dedicated "Skills", "Technical Skills", "Core Skills", "Programming Languages", or similar sections:
   - ONLY include skills explicitly listed in a dedicated skills section
   - Programming languages, frameworks, tools, technologies mentioned in skills section
   - Return as array of individual skills from the skills section only

5. **EXTRA_SKILLS**: Extract additional skills mentioned in other contexts:
   - Skills mentioned in work experience descriptions
   - Technologies used in projects
   - Skills mentioned in certifications or education
   - Any other skills not in the main skills section
   - Return as array of individual skills from non-skills sections

6. **WORK_EXPERIENCE**: Extract each position with:
   - Job title, company, duration, location
   - Key responsibilities and achievements
   - Format as structured objects with consistent fields

7. **PROJECTS**: Extract personal/academic projects with:
   - Project name, duration, technologies used
   - Brief description and key features
   - Any notable achievements or results

8. **CERTIFICATIONS**: Extract all certifications with:
   - Certification name, issuing organization, year
   - Include online courses, professional certifications

9. **EDUCATION**: Extract educational background with:
   - Degree, institution, graduation year
   - Relevant coursework or achievements

10. **EXPERIENCE_LEVEL**: Analyze the candidate's work experience and determine their experience level:
    - "Entry Level" (0-1 years): Fresh graduates, internships, or minimal professional experience
    - "Junior" (1-3 years): Some professional experience, early career roles
    - "Mid-Level" (3-7 years): Solid professional experience, can work independently
    - "Senior" (7-12 years): Advanced experience, can lead projects and mentor others
    - "Lead/Principal" (12+ years): Expert level, can architect solutions and lead teams
    - Consider total years of experience, complexity of roles, leadership responsibilities
    - Return a single string value

11. **RECOMMENDED_ROLES**: Based on the candidate's skills, experience, education, and projects, recommend 2-3 specific job roles they would be suitable for:
    - Consider their technical skills, domain expertise, and career progression
    - Include roles that match their current skill level and potential growth areas
    - Format as array of role titles (e.g., ["Software Engineer", "Data Analyst", "Frontend Developer"])
    - Be specific and industry-relevant

12. **ATS_FEEDBACK**: Provide objective evaluation:
    - score: 0-100 based on ATS compatibility
    - summary: Brief assessment
    - strengths: Positive aspects
    - improvements: Areas for enhancement 

Guidelines:
- Detect section names dynamically (e.g., "Profile", "About Me", "Objective" → summary).
- CRITICAL: Skills extraction must be source-aware:
  * "skills" array: ONLY from dedicated skills sections (Skills, Technical Skills, Core Skills, Programming Languages, etc.)
  * "extra_skills" array: Skills mentioned in work experience, projects, certifications, education, or other contexts
- Extract job/project details separately.
- For EXPERIENCE_LEVEL: Analyze total years of professional experience, role complexity, and leadership indicators
- For RECOMMENDED_ROLES: Analyze the candidate's profile holistically and suggest roles that align with their skills and experience level
- Be consistent and produce clean JSON only.
- Prioritize accuracy over completeness.
- IMPORTANT: Return ONLY the JSON object, no additional text or explanations.

Resume Text:
{resume_text}
""")

# Build the chain with StrOutputParser for better JSON handling
resume_parser_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    output_parser=StrOutputParser()
)


In [37]:
# Step 1: Extract text
resume_path = "app/resumes/SaidivyaResume.pdf"
text = extract_text_from_file(resume_path, "pdf")

# Step 2: Parse with Gemini
try:
    raw_output = resume_parser_chain.run({"resume_text": text})
    # print("Raw LLM Output:")
    # print(raw_output)
    # print("\n" + "="*50 + "\n")
    
    # Parse JSON from the output
    structured_output = json.loads(raw_output)
    print("Parsed JSON Output:")
    print(json.dumps(structured_output, indent=2))
    
    # Display experience level prominently
    if "experience_level" in structured_output and structured_output["experience_level"]:
        print("\n" + "="*60)
        print("📊 EXPERIENCE LEVEL:")
        print("="*60)
        print(f"Level: {structured_output['experience_level']}")
        print("="*60)
    
    # Display recommended roles prominently
    if "recommended_roles" in structured_output and structured_output["recommended_roles"]:
        print("\n" + "="*60)
        print("🎯 RECOMMENDED ROLES FOR THIS CANDIDATE:")
        print("="*60)
        for i, role in enumerate(structured_output["recommended_roles"], 1):
            print(f"{i}. {role}")
        print("="*60)
    
except json.JSONDecodeError as e:
    print(f"JSON Parsing Error: {e}")
    print("Raw output that failed to parse:")
    print(raw_output)
except Exception as e:
    print(f"Error: {e}")
    print("Raw output:")
    print(raw_output)


JSON Parsing Error: Expecting value: line 1 column 1 (char 0)
Raw output that failed to parse:
```json
{
  "name": "KODIPAKA SAIDIVYA",
  "location": "India",
  "summary": "I am a dedicated student and avid continuous learner with a strong proficiency in electrical and electronics engineering. I have gained valuable hands-on experience during a 6-month internship at a TS Transco substation. Currently, I am expanding my expertise in data science, SQL, and Python to keep pace with emerging technological advancements.",
  "skills": [
    "Python",
    "C",
    "SQL",
    "Data Structures",
    "Data Science",
    "MS Excel",
    "AutoCAD",
    "CMOS VLSI Design",
    "Circuit Designing",
    "Electronics Enthusiast",
    "Electrical Proficiency",
    "Quick Learning",
    "problem solving",
    "Adaptability",
    "Leadership qualities"
  ],
  "extra_skills": [
    "user interaction",
    "automation",
    "advanced grid technologies",
    "smart grid systems",
    "renewable energy integ

In [38]:
# ============================================================================
# ADVANCED JOB MATCHING SYSTEM WITH EMBEDDINGS
# ============================================================================
# This section contains the embedding-based job matching system
# Move this to the bottom as requested

# Additional imports for embedding system
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')


In [49]:
# Advanced Embedding Service for Job Matching
class EmbeddingService:
    def __init__(self):
        """Initialize the embedding service with TF-IDF vectorizer"""
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )
        
    def create_resume_embeddings(self, resume_data: dict) -> dict:
        """
        Create both global and section-based embeddings for resume
        
        Parameters:
        -----------
        resume_data : dict
            Parsed resume data with all sections
            
        Returns:
        --------
        dict
            Dictionary containing different types of embeddings
        """
        embeddings = {}
        
        # 1. Global Resume Embedding (entire resume context)
        global_text = self._build_global_resume_text(resume_data)
        embeddings['global'] = global_text
        
        # 2. Skills Embedding
        skills_text = self._build_skills_text(resume_data)
        embeddings['skills'] = skills_text
        
        # 3. Experience Embedding
        experience_text = self._build_experience_text(resume_data)
        embeddings['experience'] = experience_text
        
        # 4. Combined Section Embedding (for weighted similarity)
        combined_sections = f"{skills_text} {experience_text}"
        embeddings['sections'] = combined_sections
        
        return embeddings
    
    def create_job_embeddings(self, job_data: dict) -> dict:
        """
        Create embeddings for job posting
        
        Parameters:
        -----------
        job_data : dict
            Job data containing title, description, etc.
            
        Returns:
        --------
        dict
            Dictionary containing different types of job embeddings
        """
        embeddings = {}
        
        # 1. Global Job Embedding (title + description + company)
        global_job_text = f"{job_data.get('title', '')} {job_data.get('description', '')} {job_data.get('company', '')}"
        embeddings['global'] = global_job_text
        
        # 2. Requirements Embedding (extracted from description)
        requirements_text = self._extract_requirements(job_data.get('description', ''))
        embeddings['requirements'] = requirements_text
        
        # 3. Combined Job Context
        combined_job = f"{job_data.get('title', '')} {requirements_text}"
        embeddings['context'] = combined_job
        
        return embeddings
    
    def _build_global_resume_text(self, resume_data: dict) -> str:
        """Build comprehensive resume text for global embedding"""
        parts = []
        
        # Add summary
        if resume_data.get('summary'):
            parts.append(resume_data['summary'])
        
        # Add skills
        if resume_data.get('skills'):
            parts.extend(resume_data['skills'])
        
        # Add extra skills
        if resume_data.get('extra_skills'):
            parts.extend(resume_data['extra_skills'])
        
        # Add work experience
        if resume_data.get('work_experience'):
            for exp in resume_data['work_experience']:
                parts.append(f"{exp.get('job_title', '')} at {exp.get('company', '')}")
                if isinstance(exp.get('description'), list):
                    parts.extend(exp['description'])
                elif exp.get('description'):
                    parts.append(exp['description'])
        
        # Add projects
        if resume_data.get('projects'):
            for project in resume_data['projects']:
                parts.append(f"Project: {project.get('project_name', '')}")
                if isinstance(project.get('description'), list):
                    parts.extend(project['description'])
                elif project.get('description'):
                    parts.append(project['description'])
        
        # Add education
        if resume_data.get('education'):
            for edu in resume_data['education']:
                parts.append(f"{edu.get('degree', '')} from {edu.get('institution', '')}")
        
        return ' '.join(parts)
    
    def _build_skills_text(self, resume_data: dict) -> str:
        """Build skills-focused text"""
        skills_parts = []
        
        if resume_data.get('skills'):
            skills_parts.extend(resume_data['skills'])
        
        if resume_data.get('extra_skills'):
            skills_parts.extend(resume_data['extra_skills'])
        
        # Add technical skills from experience
        if resume_data.get('work_experience'):
            for exp in resume_data['work_experience']:
                if isinstance(exp.get('description'), list):
                    skills_parts.extend(exp['description'])
                elif exp.get('description'):
                    skills_parts.append(exp['description'])
        
        return ' '.join(skills_parts)
    
    def _build_experience_text(self, resume_data: dict) -> str:
        """Build experience-focused text"""
        experience_parts = []
        
        if resume_data.get('work_experience'):
            for exp in resume_data['work_experience']:
                exp_text = f"{exp.get('job_title', '')} {exp.get('company', '')}"
                if isinstance(exp.get('description'), list):
                    exp_text += ' ' + ' '.join(exp['description'])
                elif exp.get('description'):
                    exp_text += ' ' + exp['description']
                experience_parts.append(exp_text)
        
        # Add projects as experience
        if resume_data.get('projects'):
            for project in resume_data['projects']:
                project_text = f"Project: {project.get('project_name', '')}"
                if isinstance(project.get('description'), list):
                    project_text += ' ' + ' '.join(project['description'])
                elif project.get('description'):
                    project_text += ' ' + project['description']
                experience_parts.append(project_text)
        
        return ' '.join(experience_parts)
    
    def _extract_requirements(self, job_description: str) -> str:
        """Extract key requirements and qualifications from job description"""
        if not job_description:
            return ""
        
        # Simple keyword extraction for requirements
        # Look for common requirement patterns
        requirements_keywords = [
            'requirements', 'qualifications', 'skills', 'experience',
            'must have', 'should have', 'preferred', 'bachelor', 'master',
            'years of experience', 'proficient', 'knowledge of'
        ]
        
        sentences = job_description.split('.')
        requirement_sentences = []
        
        for sentence in sentences:
            sentence_lower = sentence.lower()
            if any(keyword in sentence_lower for keyword in requirements_keywords):
                requirement_sentences.append(sentence.strip())
        
        return ' '.join(requirement_sentences[:5])  # Take first 5 relevant sentences


In [50]:
# Advanced Job Matching System with Weighted Similarity
class JobMatchingService:
    def __init__(self, embedding_service: EmbeddingService):
        """
        Initialize the job matching service
        
        Parameters:
        -----------
        embedding_service : EmbeddingService
            Service for creating embeddings
        """
        self.embedding_service = embedding_service
        self.tfidf_vectorizer = embedding_service.tfidf_vectorizer
        
        # Default weights for similarity calculation
        self.weights = {
            'global': 0.5,      # Global resume vs global job
            'skills': 0.3,      # Skills vs requirements
            'experience': 0.2   # Experience vs job description
        }
    
    def calculate_similarity(self, resume_embeddings: dict, job_embeddings: dict) -> dict:
        """
        Calculate weighted similarity between resume and job
        
        Parameters:
        -----------
        resume_embeddings : dict
            Resume embeddings (global, skills, experience, sections)
        job_embeddings : dict
            Job embeddings (global, requirements, context)
            
        Returns:
        --------
        dict
            Similarity scores and final weighted score
        """
        similarities = {}
        
        try:
            # 1. Global Similarity (resume global vs job global)
            global_sim = self._calculate_text_similarity(
                resume_embeddings['global'], 
                job_embeddings['global']
            )
            similarities['global'] = global_sim
            
            # 2. Skills Similarity (resume skills vs job requirements)
            skills_sim = self._calculate_text_similarity(
                resume_embeddings['skills'], 
                job_embeddings['requirements']
            )
            similarities['skills'] = skills_sim
            
            # 3. Experience Similarity (resume experience vs job context)
            experience_sim = self._calculate_text_similarity(
                resume_embeddings['experience'], 
                job_embeddings['context']
            )
            similarities['experience'] = experience_sim
            
            # 4. Calculate weighted final score
            final_score = (
                self.weights['global'] * similarities['global'] +
                self.weights['skills'] * similarities['skills'] +
                self.weights['experience'] * similarities['experience']
            )
            
            similarities['final_score'] = final_score
            
            return similarities
            
        except Exception as e:
            print(f"Error calculating similarity: {e}")
            return {
                'global': 0.0,
                'skills': 0.0,
                'experience': 0.0,
                'final_score': 0.0,
                'error': str(e)
            }
    
    def _calculate_text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two text strings using TF-IDF
        
        Parameters:
        -----------
        text1 : str
            First text string
        text2 : str
            Second text string
            
        Returns:
        --------
        float
            Cosine similarity score (0-1)
        """
        if not text1.strip() or not text2.strip():
            return 0.0
        
        try:
            # Vectorize the texts
            tfidf_matrix = self.tfidf_vectorizer.fit_transform([text1, text2])
            
            # Calculate cosine similarity
            similarity_matrix = cosine_similarity(tfidf_matrix)
            
            # Return similarity between the two texts
            return similarity_matrix[0][1]
            
        except Exception as e:
            print(f"Error in text similarity calculation: {e}")
            return 0.0
    
    def match_jobs(self, resume_data: dict, jobs_df: pd.DataFrame, 
                   threshold: float = 0.3, top_n: int = 10) -> pd.DataFrame:
        """
        Match jobs with resume and return filtered results
        
        Parameters:
        -----------
        resume_data : dict
            Parsed resume data
        jobs_df : pd.DataFrame
            DataFrame containing job postings
        threshold : float
            Minimum similarity threshold (0-1)
        top_n : int
            Maximum number of top matches to return
            
        Returns:
        --------
        pd.DataFrame
            Filtered and ranked job matches
        """
        print(f"🎯 Starting job matching for {len(jobs_df)} jobs...")
        print(f"📊 Using threshold: {threshold}, Top N: {top_n}")
        
        # Create resume embeddings
        resume_embeddings = self.embedding_service.create_resume_embeddings(resume_data)
        
        # Calculate similarities for each job
        job_scores = []
        
        for idx, job_row in jobs_df.iterrows():
            job_data = {
                'title': job_row.get('title', ''),
                'description': job_row.get('description', ''),
                'company': job_row.get('company', ''),
                'location': job_row.get('location', ''),
                'link': job_row.get('link', '')
            }
            
            # Create job embeddings
            job_embeddings = self.embedding_service.create_job_embeddings(job_data)
            
            # Calculate similarity
            similarities = self.calculate_similarity(resume_embeddings, job_embeddings)
            
            # Add job info and scores
            job_info = {
                'job_index': idx,
                'title': job_data['title'],
                'company': job_data['company'],
                'location': job_data['location'],
                'link': job_data['link'],
                'global_similarity': similarities['global'],
                'skills_similarity': similarities['skills'],
                'experience_similarity': similarities['experience'],
                'final_score': similarities['final_score']
            }
            
            job_scores.append(job_info)
        
        # Convert to DataFrame and filter
        results_df = pd.DataFrame(job_scores)
        
        # Filter by threshold
        filtered_df = results_df[results_df['final_score'] >= threshold].copy()
        
        # Sort by final score (descending)
        filtered_df = filtered_df.sort_values('final_score', ascending=False)
        
        # Take top N results
        if top_n > 0:
            filtered_df = filtered_df.head(top_n)
        
        print(f"✅ Found {len(filtered_df)} jobs above threshold {threshold}")
        
        return filtered_df
    
    def update_weights(self, global_weight: float = None, skills_weight: float = None, 
                      experience_weight: float = None):
        """
        Update similarity calculation weights
        
        Parameters:
        -----------
        global_weight : float
            Weight for global similarity
        skills_weight : float
            Weight for skills similarity
        experience_weight : float
            Weight for experience similarity
        """
        if global_weight is not None:
            self.weights['global'] = global_weight
        if skills_weight is not None:
            self.weights['skills'] = skills_weight
        if experience_weight is not None:
            self.weights['experience'] = experience_weight
        
        # Normalize weights to sum to 1
        total_weight = sum(self.weights.values())
        if total_weight > 0:
            for key in self.weights:
                self.weights[key] = self.weights[key] / total_weight
        
        print(f"🔄 Updated weights: {self.weights}")

# Initialize services
embedding_service = EmbeddingService()
job_matching_service = JobMatchingService(embedding_service)


In [51]:
# Demo: Advanced Job Matching with Real Data
def demo_job_matching():
    """
    Demo function to test the advanced job matching system
    """
    print("🚀 AI Career System - Advanced Job Matching Demo")
    print("="*70)
    
    # Load the scraped jobs data
    jobs_file = "jobs_KODIPAKA_SAIDIVYA_20251008_151622.csv"
    
    try:
        jobs_df = pd.read_csv(jobs_file)
        print(f"✅ Loaded {len(jobs_df)} jobs from {jobs_file}")
        
        # Display sample job data
        print(f"\n📊 Sample Job Data:")
        print(f"Columns: {list(jobs_df.columns)}")
        print(f"First job title: {jobs_df.iloc[0]['title']}")
        print(f"First company: {jobs_df.iloc[0]['company']}")
        
    except FileNotFoundError:
        print(f"❌ Jobs file not found: {jobs_file}")
        print("Please run the job scraping first or provide a valid CSV file")
        return None
    except Exception as e:
        print(f"❌ Error loading jobs: {e}")
        return None
    
    # Sample resume data (you can replace this with actual parsed resume data)
    sample_resume = {
        "name": "KODIPAKA SAIDIVYA",
        "location": "India",
        "summary": "Dedicated student with strong proficiency in electrical and electronics engineering. Experience in data science, SQL, and Python. 6-month internship at TS Transco substation.",
        "skills": ["Python", "C", "SQL", "Data Structures", "Data Science", "MS Excel", "AutoCAD", "CMOS VLSI Design", "Circuit Designing"],
        "extra_skills": ["MATLAB", "ML", "Simulink", "Reinforcement Learning", "AI-ML", "Electrical and Electronics Engineering"],
        "work_experience": [
            {
                "job_title": "Python Developer (Intern)",
                "company": "CodeAlpha India",
                "duration": "01/06/24 – 30/06/24",
                "description": ["Developed a Hangman game", "Created a small chatbot"]
            }
        ],
        "projects": [
            {
                "project_name": "UAV Autonomous Navigation",
                "description": ["Used RL algorithms (Q-learning, DQN)", "Implemented autonomous navigation system"]
            }
        ],
        "education": [
            {
                "degree": "Bachelor of Technology in Electrical and Electronics Engineering",
                "institution": "University Name"
            }
        ],
        "experience_level": "Entry Level",
        "recommended_roles": ["Data Analyst", "Software Engineer", "Python Developer"]
    }
    
    print(f"\n👤 Resume Analysis:")
    print(f"Name: {sample_resume['name']}")
    print(f"Experience Level: {sample_resume['experience_level']}")
    print(f"Key Skills: {sample_resume['skills'][:5]}")
    
    # Test the matching system
    print(f"\n🎯 Running Job Matching...")
    
    # Test with different thresholds
    thresholds = [0.2, 0.3, 0.4, 0.5]
    
    for threshold in thresholds:
        print(f"\n📊 Testing with threshold: {threshold}")
        matched_jobs = job_matching_service.match_jobs(
            resume_data=sample_resume,
            jobs_df=jobs_df,
            threshold=threshold,
            top_n=5
        )
        
        if len(matched_jobs) > 0:
            print(f"✅ Found {len(matched_jobs)} matching jobs:")
            for idx, job in matched_jobs.iterrows():
                print(f"  {idx+1}. {job['title']} at {job['company']} (Score: {job['final_score']:.3f})")
        else:
            print(f"⚠️ No jobs found above threshold {threshold}")
    
    return matched_jobs if 'matched_jobs' in locals() else None

# Run the demo
print("🧪 Running Job Matching Demo...")
demo_results = demo_job_matching()


🧪 Running Job Matching Demo...
🚀 AI Career System - Advanced Job Matching Demo
✅ Loaded 69 jobs from jobs_KODIPAKA_SAIDIVYA_20251008_151622.csv

📊 Sample Job Data:
Columns: ['title', 'company', 'location', 'link', 'description', 'searched_for', 'experience_level_filter', 'days_back', 'scraped_at']
First job title: Graduate Engineering Trainee
First company: Adani Electricity

👤 Resume Analysis:
Name: KODIPAKA SAIDIVYA
Experience Level: Entry Level
Key Skills: ['Python', 'C', 'SQL', 'Data Structures', 'Data Science']

🎯 Running Job Matching...

📊 Testing with threshold: 0.2
🎯 Starting job matching for 69 jobs...
📊 Using threshold: 0.2, Top N: 5
Error in text similarity calculation: max_df corresponds to < documents than min_df
Error in text similarity calculation: max_df corresponds to < documents than min_df
Error in text similarity calculation: max_df corresponds to < documents than min_df
Error in text similarity calculation: max_df corresponds to < documents than min_df
Error in tex

In [42]:
import os 
from dotenv import load_dotenv
import pdfplumber
import docx2txt

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser


In [43]:
# Load .env for API key
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

In [44]:
if not api_key:
    raise ValueError("❌ GOOGLE_API_KEY not found in .env file")

In [45]:
# Initialize Gemini 2.5 Flash LLM with safe optimizations (maintains consistency)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,  # Restore original temperature for consistency
    api_key=api_key,
    # max_output_tokens=4096,  # Set explicit limit for faster response
    max_retries=2,  # Keep reasonable retries for reliability
)

E0000 00:00:1759938783.987993   13497 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [46]:

def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract text from PDF or DOCX resume."""
    if file_type.lower() == "pdf":
        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            max_pages = min(2, len(pdf.pages))
            for i in range(max_pages):
                text_parts.append(pdf.pages[i].extract_text() or "")
        text = "\n".join(text_parts)
    elif file_type.lower() == "docx":
        text = docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Optional cleaning
    text = text.replace('\n', ' ').replace('\t', ' ').strip()
    return text


In [47]:
# Combined Unified Resume Extraction with JSON Output (Fixed Parser Issue)
import json

# Create a LangChain prompt with direct JSON output
prompt = ChatPromptTemplate.from_template("""
You are an expert Resume Intelligence Agent that extracts structured data and evaluates resumes for ATS compatibility.

Analyze the following resume text and return ONLY a valid JSON object with these exact keys:

{{
  "name": "",
  "location": "",
  "summary": "",
  "skills": [],
  "extra_skills": [],
  "work_experience": [],
  "projects": [],
  "certifications": [],
  "education": [],
  "experience_level": "",
  "recommended_roles": [],
  "ats_feedback": {{
    "score": 0,
    "summary": "",
    "strengths": [],
    "improvements": []
  }}
}}

CRITICAL EXTRACTION RULES FOR ALL SECTIONS:

1. **NAME**: Extract the full name exactly as written, using the most prominent name (usually at the top).

2. **LOCATION**: Extract specific city and country from contact info, address, or personal details. Format as "City, Country" (e.g., "Chennai, India", "Bangalore, India"). If no city is specified, use " Country".

3. **SUMMARY**: Look for sections titled "Summary", "Objective", "Profile", "About Me", "Career Summary". Extract complete professional summary.

4. **SKILLS**: Extract skills ONLY from dedicated "Skills", "Technical Skills", "Core Skills", "Programming Languages", or similar sections:
   - ONLY include skills explicitly listed in a dedicated skills section
   - Programming languages, frameworks, tools, technologies mentioned in skills section
   - Return as array of individual skills from the skills section only

5. **EXTRA_SKILLS**: Extract additional skills mentioned in other contexts:
   - Skills mentioned in work experience descriptions
   - Technologies used in projects
   - Skills mentioned in certifications or education
   - Any other skills not in the main skills section
   - Return as array of individual skills from non-skills sections

6. **WORK_EXPERIENCE**: Extract each position with:
   - Job title, company, duration, location
   - Key responsibilities and achievements
   - Format as structured objects with consistent fields

7. **PROJECTS**: Extract personal/academic projects with:
   - Project name, duration, technologies used
   - Brief description and key features
   - Any notable achievements or results

8. **CERTIFICATIONS**: Extract all certifications with:
   - Certification name, issuing organization, year
   - Include online courses, professional certifications

9. **EDUCATION**: Extract educational background with:
   - Degree, institution, graduation year
   - Relevant coursework or achievements

10. **EXPERIENCE_LEVEL**: Analyze the candidate's work experience and determine their experience level:
    - "Entry Level" (0-1 years): Fresh graduates, internships, or minimal professional experience
    - "Junior" (1-3 years): Some professional experience, early career roles
    - "Mid-Level" (3-7 years): Solid professional experience, can work independently
    - "Senior" (7-12 years): Advanced experience, can lead projects and mentor others
    - "Lead/Principal" (12+ years): Expert level, can architect solutions and lead teams
    - Consider total years of experience, complexity of roles, leadership responsibilities
    - Return a single string value

11. **RECOMMENDED_ROLES**: Based on the candidate's skills, experience, education, and projects, recommend 2-3 specific job roles they would be suitable for:
    - Consider their technical skills, domain expertise, and career progression
    - Include roles that match their current skill level and potential growth areas
    - Format as array of role titles (e.g., ["Software Engineer", "Data Analyst", "Frontend Developer"])
    - Be specific and industry-relevant

12. **ATS_FEEDBACK**: Provide objective evaluation:
    - score: 0-100 based on ATS compatibility
    - summary: Brief assessment
    - strengths: Positive aspects
    - improvements: Areas for enhancement 

Guidelines:
- Detect section names dynamically (e.g., "Profile", "About Me", "Objective" → summary).
- CRITICAL: Skills extraction must be source-aware:
  * "skills" array: ONLY from dedicated skills sections (Skills, Technical Skills, Core Skills, Programming Languages, etc.)
  * "extra_skills" array: Skills mentioned in work experience, projects, certifications, education, or other contexts
- Extract job/project details separately.
- For EXPERIENCE_LEVEL: Analyze total years of professional experience, role complexity, and leadership indicators
- For RECOMMENDED_ROLES: Analyze the candidate's profile holistically and suggest roles that align with their skills and experience level
- Be consistent and produce clean JSON only.
- Prioritize accuracy over completeness.
- IMPORTANT: Return ONLY the JSON object, no additional text or explanations.

Resume Text:
{resume_text}
""")

# Build the chain with StrOutputParser for better JSON handling
resume_parser_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    output_parser=StrOutputParser()
)

In [48]:
# Step 1: Extract text
resume_path = "app/resumes/SaidivyaResume.pdf"
text = extract_text_from_file(resume_path, "pdf")

# Step 2: Parse with Gemini
try:
    raw_output = resume_parser_chain.run({"resume_text": text})
    # print("Raw LLM Output:")
    # print(raw_output)
    # print("\n" + "="*50 + "\n")
    
    # Parse JSON from the output
    structured_output = json.loads(raw_output)
    print("Parsed JSON Output:")
    print(json.dumps(structured_output, indent=2))
    
    # Display experience level prominently
    # if "experience_level" in structured_output and structured_output["experience_level"]:
    #     print("\n" + "="*60)
    #     print("📊 EXPERIENCE LEVEL:")
    #     print("="*60)
    #     print(f"Level: {structured_output['experience_level']}")
    #     print("="*60)
    
    # # Display recommended roles prominently
    # if "recommended_roles" in structured_output and structured_output["recommended_roles"]:
    #     print("\n" + "="*60)
    #     print("🎯 RECOMMENDED ROLES FOR THIS CANDIDATE:")
    #     print("="*60)
    #     for i, role in enumerate(structured_output["recommended_roles"], 1):
    #         print(f"{i}. {role}")
    #     print("="*60)
    
except json.JSONDecodeError as e:
    print(f"JSON Parsing Error: {e}")
    print("Raw output that failed to parse:")
    print(raw_output)
except Exception as e:
    print(f"Error: {e}")
    print("Raw output:")
    print(raw_output)


JSON Parsing Error: Expecting value: line 1 column 1 (char 0)
Raw output that failed to parse:
```json
{
  "name": "KODIPAKA SAIDIVYA",
  "location": "India",
  "summary": "I am a dedicated student and avid continuous learner with a strong proficiency in electrical and electronics engineering. I have gained valuable hands-on experience during a 6-month internship at a TS Transco substation. Currently, I am expanding my expertise in data science, SQL, and Python to keep pace with emerging technological advancements.",
  "skills": [
    "Python",
    "C",
    "SQL",
    "Data Structures",
    "Data Science",
    "MS Excel",
    "AutoCAD",
    "CMOS VLSI Design",
    "Circuit Designing",
    "Electronics Enthusiast",
    "Electrical Proficiency",
    "Quick Learning",
    "Problem Solving",
    "Adaptability",
    "Leadership Qualities"
  ],
  "extra_skills": [
    "Hangman Game Development",
    "Chatbot Development",
    "Advanced Grid Technologies",
    "Smart Grid Systems",
    "Renew

In [25]:
# LinkedIn Job Scraper Integration
# Import necessary packages for web scraping and logging
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import random
import time
import urllib.parse

# Configure logging settings
logging.basicConfig(filename="linkedin_scraping.log", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [26]:
class LinkedInJobScraper:
    def __init__(self, headless=False):
        """
        Initialize the LinkedIn Job Scraper
        
        Parameters:
        -----------
        headless : bool
            Whether to run Chrome in headless mode (default: False)
        """
        self.headless = headless
        self.driver = None
        self.setup_driver()
    
    def setup_driver(self):
        """Setup Chrome WebDriver with appropriate options"""
        try:
            options = webdriver.ChromeOptions()
            
            # Basic options
            options.add_argument("--start-maximized")
            options.add_argument("--no-sandbox")
            options.add_argument("--disable-dev-shm-usage")
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)
            
            # User agent to avoid detection
            options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
            
            if self.headless:
                options.add_argument("--headless")
            
            self.driver = webdriver.Chrome(options=options)
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            
            logging.info("Chrome WebDriver initialized successfully")
            
        except Exception as e:
            logging.error(f"Failed to initialize WebDriver: {str(e)}")
            raise
    
    def build_search_url(self, job_title: str, location: str = "India", experience_level: str = None, 
                        time_posted: str = None, remote: bool = False) -> str:
        """
        Build LinkedIn job search URL with filters
        
        Parameters:
        -----------
        job_title : str
            Job title to search for
        location : str
            Location to search in (default: "India")
        experience_level : str
            Experience level filter (Entry level, Associate, Mid-Senior level, Director, Executive)
        time_posted : str
            Time posted filter (r86400, r604800, r2592000, r31536000)
        remote : bool
            Whether to include remote jobs only
        
        Returns:
        --------
        str
            Complete LinkedIn job search URL
        """
        base_url = "https://www.linkedin.com/jobs/search/"
        
        # URL encode parameters
        job_title_encoded = urllib.parse.quote(job_title)
        location_encoded = urllib.parse.quote(location)
        
        # Build query parameters
        params = {
            "keywords": job_title_encoded,
            "location": location_encoded,
            "f_TPR": time_posted if time_posted else None,  # Time posted filter
            "f_E": self._get_experience_filter(experience_level) if experience_level else None,  # Experience filter
            
        }
        
        # Filter out None values and build query string
        query_params = {k: v for k, v in params.items() if v is not None}
        query_string = "&".join([f"{k}={v}" for k, v in query_params.items()])
        
        full_url = f"{base_url}?{query_string}"
        logging.info(f"Built search URL: {full_url}")
        
        return full_url
    
    def _get_experience_filter(self, experience_level: str) -> str:
        """
        Map experience level to LinkedIn filter values
        
        Parameters:
        -----------
        experience_level : str
            Experience level from resume analysis
        
        Returns:
        --------
        str
            LinkedIn experience filter value
        """
        experience_mapping = {
            "Entry Level": "1",           # Entry level
            "Junior": "2",                # Associate
            "Mid-Level": "3",             # Mid-Senior level
            "Senior": "4",                # Director
            "Lead/Principal": "5"         # Executive
        }
        
        return experience_mapping.get(experience_level, "1")  # Default to Entry level
    
    def _get_time_filter(self, days: int) -> str:
        """
        Get time posted filter based on days
        
        Parameters:
        -----------
        days : int
            Number of days to look back
        
        Returns:
        --------
        str
            LinkedIn time filter value
        """
        time_mapping = {
            1: "r86400",      # Past 24 hours
            7: "r604800",     # Past week
            30: "r2592000",   # Past month
            365: "r31536000"  # Past year
        }
        
        return time_mapping.get(days, "r604800")  # Default to past week


In [27]:
# Add the remaining methods to LinkedInJobScraper class
def add_methods_to_scraper():
    """Add the scrape_jobs and close methods to LinkedInJobScraper class"""
    
    def scrape_jobs(self, job_title: str, location: str = "India", pages: int = 1, 
                   experience_level: str = None, days_back: int = 7) -> list:
        """Scrape job listings from LinkedIn with filters"""
        logging.info(f'Starting LinkedIn job scrape for "{job_title}" in "{location}"...')
        
        # Build search URL with filters
        time_filter = self._get_time_filter(days_back)
        search_url = self.build_search_url(
            job_title=job_title,
            location=location,
            experience_level=experience_level,
            time_posted=time_filter
        )
        
        try:
            # Navigate to the LinkedIn job search page
            self.driver.get(search_url)
            time.sleep(3)  # Wait for page to load
            
            # Scroll through the specified number of pages
            for i in range(pages):
                logging.info(f"Scrolling to bottom of page {i+1}...")
                
                # Scroll to the bottom of the page using JavaScript
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                
                try:
                    # Wait for the "Show more" button to be present on the page
                    element = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located(
                            (By.XPATH, "/html/body/div[1]/div/main/section[2]/button")
                        )
                    )
                    # Click on the "Show more" button
                    element.click()
                    logging.info("Clicked 'Show more' button")
                    
                except Exception:
                    logging.info("Show more button not found or not clickable")
                
                # Wait for a random amount of time before scrolling to the next page
                time.sleep(random.choice(list(range(3, 7))))
            
            # Scrape the job postings
            jobs = []
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            
            # Updated selectors for current LinkedIn structure
            job_listings = soup.find_all(
                "div",
                class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
            )
            
            logging.info(f"Found {len(job_listings)} job listings to process")
            
            for idx, job in enumerate(job_listings):
                try:
                    # Extract job details with error handling
                    job_title_elem = job.find("h3", class_="base-search-card__title")
                    job_title_text = job_title_elem.text.strip() if job_title_elem else "N/A"
                    
                    job_company_elem = job.find("h4", class_="base-search-card__subtitle")
                    job_company_text = job_company_elem.text.strip() if job_company_elem else "N/A"
                    
                    job_location_elem = job.find("span", class_="job-search-card__location")
                    job_location_text = job_location_elem.text.strip() if job_location_elem else "N/A"
                    
                    apply_link_elem = job.find("a", class_="base-card__full-link")
                    apply_link = apply_link_elem["href"] if apply_link_elem else "N/A"
                    
                    # Navigate to the job posting page and scrape the description
                    if apply_link != "N/A":
                        self.driver.get(apply_link)
                        time.sleep(random.choice(list(range(5, 11))))
                        
                        try:
                            description_soup = BeautifulSoup(self.driver.page_source, "html.parser")
                            job_description_elem = description_soup.find(
                                "div", class_="description__text description__text--rich"
                            )
                            job_description = job_description_elem.text.strip() if job_description_elem else "Description not available"
                        except Exception as e:
                            logging.warning(f"Could not retrieve job description: {str(e)}")
                            job_description = "Description not available"
                    else:
                        job_description = "Description not available"
                    
                    # Add job details to the jobs list
                    job_data = {
                        "title": job_title_text,
                        "company": job_company_text,
                        "location": job_location_text,
                        "link": apply_link,
                        "description": job_description,
                        "searched_for": job_title,
                        "experience_level_filter": experience_level,
                        "days_back": days_back,
                        "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S")
                    }
                    
                    jobs.append(job_data)
                    logging.info(f'Scraped "{job_title_text}" at {job_company_text} in {job_location_text}...')
                    
                except Exception as e:
                    logging.error(f"Error processing job listing {idx}: {str(e)}")
                    continue
            
            logging.info(f"Successfully scraped {len(jobs)} jobs")
            return jobs
            
        except Exception as e:
            logging.error(f"An error occurred while scraping jobs: {str(e)}")
            return []
    
    def close(self):
        """Close the WebDriver"""
        if self.driver:
            self.driver.quit()
            logging.info("WebDriver closed successfully")
    
    # Add methods to the class
    LinkedInJobScraper.scrape_jobs = scrape_jobs
    LinkedInJobScraper.close = close

# Execute the function to add methods to the class
add_methods_to_scraper()


In [28]:
# Utility functions for job scraping and data management

def save_jobs_to_csv(jobs_data: list, filename: str = "linkedin_jobs.csv") -> None:
    """
    Save job data to a CSV file.
    
    Args:
        jobs_data: A list of dictionaries containing job data
        filename: Name of the CSV file to save to
    
    Returns:
        None
    """
    if not jobs_data:
        logging.warning("No job data to save")
        return
    
    try:
        # Create a pandas DataFrame from the job data
        df = pd.DataFrame(jobs_data)
        
        # Save the DataFrame to a CSV file without including the index column
        df.to_csv(filename, index=False)
        
        # Log a message indicating how many jobs were successfully scraped and saved
        logging.info(f"Successfully saved {len(jobs_data)} jobs to {filename}")
        print(f"✅ Saved {len(jobs_data)} jobs to {filename}")
        
    except Exception as e:
        logging.error(f"Error saving jobs to CSV: {str(e)}")
        print(f"❌ Error saving jobs to CSV: {str(e)}")

def scrape_jobs_for_resume(resume_data: dict, pages_per_role: int = 1, days_back: int = 7) -> list:
    """
    Scrape LinkedIn jobs based on resume analysis results
    
    Parameters:
    -----------
    resume_data : dict
        Resume analysis data containing recommended_roles and experience_level
    pages_per_role : int
        Number of pages to scrape for each recommended role
    days_back : int
        Number of days to look back for job postings
    
    Returns:
    --------
    list
        Combined list of all scraped jobs
    """
    all_jobs = []
    
    # Extract recommended roles and experience level
    recommended_roles = resume_data.get("recommended_roles", [])
    experience_level = resume_data.get("experience_level", "")
    
    if not recommended_roles:
        logging.warning("No recommended roles found in resume data")
        print("⚠️ No recommended roles found in resume data")
        return []
    
    print(f"🎯 Found {len(recommended_roles)} recommended roles: {recommended_roles}")
    print(f"📊 Experience Level: {experience_level}")
    print(f"⏰ Looking for jobs posted in the last {days_back} days")
    print("="*80)
    
    # Initialize scraper
    scraper = LinkedInJobScraper(headless=False)  # Set to True for headless mode
    
    try:
        for i, role in enumerate(recommended_roles, 1):
            print(f"\n🔍 Scraping jobs for role {i}/{len(recommended_roles)}: '{role}'")
            
            # Scrape jobs for this role
            jobs = scraper.scrape_jobs(
                job_title=role,
                location="India",
                pages=pages_per_role,
                experience_level=experience_level,
                days_back=days_back
            )
            
            if jobs:
                all_jobs.extend(jobs)
                print(f"✅ Found {len(jobs)} jobs for '{role}'")
            else:
                print(f"⚠️ No jobs found for '{role}'")
            
            # Add delay between role searches to be respectful
            if i < len(recommended_roles):
                time.sleep(random.choice(list(range(5, 10))))
        
        print(f"\n🎉 Total jobs scraped: {len(all_jobs)}")
        
    except Exception as e:
        logging.error(f"Error during job scraping: {str(e)}")
        print(f"❌ Error during job scraping: {str(e)}")
    
    finally:
        # Always close the scraper
        scraper.close()
    
    return all_jobs

def analyze_and_scrape_jobs(resume_path: str, file_type: str = "pdf", 
                          pages_per_role: int = 1, days_back: int = 7) -> dict:
    """
    Complete pipeline: Analyze resume and scrape relevant jobs
    
    Parameters:
    -----------
    resume_path : str
        Path to the resume file
    file_type : str
        Type of resume file (pdf or docx)
    pages_per_role : int
        Number of pages to scrape for each recommended role
    days_back : int
        Number of days to look back for job postings
    
    Returns:
    --------
    dict
        Combined resume analysis and job scraping results
    """
    print("🚀 Starting Resume Analysis and Job Scraping Pipeline")
    print("="*80)
    
    # Step 1: Extract and analyze resume
    print("📄 Step 1: Analyzing resume...")
    text = extract_text_from_file(resume_path, file_type)
    
    try:
        raw_output = resume_parser_chain.run({"resume_text": text})
        resume_data = json.loads(raw_output)
        
        print("✅ Resume analysis completed")
        print(f"👤 Candidate: {resume_data.get('name', 'N/A')}")
        print(f"📍 Location: {resume_data.get('location', 'N/A')}")
        print(f"📊 Experience Level: {resume_data.get('experience_level', 'N/A')}")
        print(f"🎯 Recommended Roles: {resume_data.get('recommended_roles', [])}")
        
    except Exception as e:
        print(f"❌ Error analyzing resume: {str(e)}")
        return {"error": f"Resume analysis failed: {str(e)}"}
    
    # Step 2: Scrape jobs based on analysis
    print(f"\n🔍 Step 2: Scraping jobs for recommended roles...")
    scraped_jobs = scrape_jobs_for_resume(resume_data, pages_per_role, days_back)
    
    # Step 3: Combine results
    result = {
        "resume_analysis": resume_data,
        "scraped_jobs": scraped_jobs,
        "summary": {
            "total_jobs_found": len(scraped_jobs),
            "recommended_roles_searched": resume_data.get("recommended_roles", []),
            "experience_level": resume_data.get("experience_level", ""),
            "scraping_date": time.strftime("%Y-%m-%d %H:%M:%S")
        }
    }
    
    # Step 4: Save results
    if scraped_jobs:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        csv_filename = f"jobs_{resume_data.get('name', 'candidate').replace(' ', '_')}_{timestamp}.csv"
        save_jobs_to_csv(scraped_jobs, csv_filename)
        result["csv_file"] = csv_filename
    
    print("\n🎉 Pipeline completed successfully!")
    print("="*80)
    
    return result


In [29]:
# Fixed version with better error handling
def analyze_and_scrape_jobs_fixed(resume_path: str, file_type: str = "pdf", 
                                pages_per_role: int = 1, days_back: int = 7) -> dict:
    """
    Complete pipeline: Analyze resume and scrape relevant jobs (with better error handling)
    """
    print("🚀 Starting Resume Analysis and Job Scraping Pipeline")
    print("="*80)
    
    # Step 1: Extract and analyze resume
    print("📄 Step 1: Analyzing resume...")
    text = extract_text_from_file(resume_path, file_type)
    
    try:
        raw_output = resume_parser_chain.run({"resume_text": text})
        
        # Clean the output to handle markdown formatting
        cleaned_output = raw_output.strip()
        if cleaned_output.startswith("```json"):
            cleaned_output = cleaned_output[7:]
        if cleaned_output.endswith("```"):
            cleaned_output = cleaned_output[:-3]
        cleaned_output = cleaned_output.strip()
        
        # Try to parse JSON
        try:
            resume_data = json.loads(cleaned_output)
        except json.JSONDecodeError as e:
            print(f"❌ JSON parsing failed: {e}")
            
            # Try to extract JSON from the output
            json_start = cleaned_output.find('{')
            if json_start != -1:
                json_part = cleaned_output[json_start:]
                try:
                    resume_data = json.loads(json_part)
                    print("✅ JSON parsing successful after extraction!")
                except json.JSONDecodeError as e2:
                    print(f"❌ Still failed after extraction: {e2}")
                    return {"error": f"JSON parsing failed: {e2}"}
            else:
                return {"error": f"No JSON found in output: {cleaned_output[:200]}..."}
        
        print("✅ Resume analysis completed")
        print(f"👤 Candidate: {resume_data.get('name', 'N/A')}")
        print(f"📍 Location: {resume_data.get('location', 'N/A')}")
        print(f"📊 Experience Level: {resume_data.get('experience_level', 'N/A')}")
        print(f"🎯 Recommended Roles: {resume_data.get('recommended_roles', [])}")
        
    except Exception as e:
        print(f"❌ Error analyzing resume: {str(e)}")
        return {"error": f"Resume analysis failed: {str(e)}"}
    
    # Step 2: Check if we have recommended roles
    recommended_roles = resume_data.get("recommended_roles", [])
    if not recommended_roles:
        print("⚠️ No recommended roles found. Cannot scrape jobs.")
        return {
            "resume_analysis": resume_data,
            "scraped_jobs": [],
            "summary": {
                "total_jobs_found": 0,
                "recommended_roles_searched": [],
                "experience_level": resume_data.get("experience_level", ""),
                "scraping_date": time.strftime("%Y-%m-%d %H:%M:%S"),
                "status": "No recommended roles found"
            }
        }
    
    # Step 3: Scrape jobs based on analysis
    print(f"\n🔍 Step 2: Scraping real jobs for {len(recommended_roles)} roles...")
    print("⚠️ This will open a browser window and scrape real LinkedIn jobs")
    
    # Initialize scraper
    scraper = LinkedInJobScraper(headless=False)  # Set to True for headless mode
    
    all_jobs = []
    
    try:
        for i, role in enumerate(recommended_roles, 1):
            print(f"\n🔍 Scraping jobs for role {i}/{len(recommended_roles)}: '{role}'")
            
            # Scrape jobs for this role
            jobs = scraper.scrape_jobs(
                job_title=role,
                location="India",
                pages=pages_per_role,
                experience_level=resume_data.get("experience_level", ""),
                days_back=days_back
            )
            
            if jobs:
                all_jobs.extend(jobs)
                print(f"✅ Found {len(jobs)} jobs for '{role}'")
            else:
                print(f"⚠️ No jobs found for '{role}'")
            
            # Add delay between role searches to be respectful
            if i < len(recommended_roles):
                print(f"⏳ Waiting before next search...")
                time.sleep(random.choice(list(range(5, 10))))
        
        print(f"\n🎉 Total real jobs scraped: {len(all_jobs)}")
        
    except Exception as e:
        logging.error(f"Error during job scraping: {str(e)}")
        print(f"❌ Error during job scraping: {str(e)}")
        all_jobs = []
    
    finally:
        # Always close the scraper
        scraper.close()
        print("🔒 Browser closed")
    
    # Step 4: Combine results
    result = {
        "resume_analysis": resume_data,
        "scraped_jobs": all_jobs,
        "summary": {
            "total_jobs_found": len(all_jobs),
            "recommended_roles_searched": recommended_roles,
            "experience_level": resume_data.get("experience_level", ""),
            "scraping_date": time.strftime("%Y-%m-%d %H:%M:%S"),
            "status": "Real LinkedIn jobs scraped"
        }
    }
    
    # Step 5: Save results
    if all_jobs:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        candidate_name = resume_data.get('name', 'candidate').replace(' ', '_')
        csv_filename = f"jobs_{candidate_name}_{timestamp}.csv"
        save_jobs_to_csv(all_jobs, csv_filename)
        result["csv_file"] = csv_filename
        print(f"💾 Jobs saved to: {csv_filename}")
    else:
        print("⚠️ No jobs found to save")
    
    print("\n🎉 Pipeline completed successfully!")
    print("="*80)
    
    return result

# Test the fixed version
print("🧪 Testing Fixed Pipeline...")
result = analyze_and_scrape_jobs_fixed(
    resume_path="app/resumes/SaidivyaResume.pdf",
    file_type="pdf",
    pages_per_role=1,
    days_back=7
)

# Print summary safely
print("\n📊 PIPELINE SUMMARY:")
print("="*50)
if "summary" in result:
    print(f"Total Jobs Found: {result['summary']['total_jobs_found']}")
    print(f"Roles Searched: {result['summary']['recommended_roles_searched']}")
    print(f"Experience Level: {result['summary']['experience_level']}")
    print(f"Status: {result['summary']['status']}")
    if 'csv_file' in result:
        print(f"CSV File: {result['csv_file']}")
else:
    print(f"❌ Error: {result.get('error', 'Unknown error')}")


🧪 Testing Fixed Pipeline...
🚀 Starting Resume Analysis and Job Scraping Pipeline
📄 Step 1: Analyzing resume...
✅ Resume analysis completed
👤 Candidate: KODIPAKA SAIDIVYA
📍 Location: India
📊 Experience Level: Entry Level
🎯 Recommended Roles: ['Junior Electrical Engineer', 'Data Science Intern', 'Machine Learning Intern']

🔍 Step 2: Scraping real jobs for 3 roles...
⚠️ This will open a browser window and scrape real LinkedIn jobs

🔍 Scraping jobs for role 1/3: 'Junior Electrical Engineer'
✅ Found 15 jobs for 'Junior Electrical Engineer'
⏳ Waiting before next search...

🔍 Scraping jobs for role 2/3: 'Data Science Intern'
✅ Found 24 jobs for 'Data Science Intern'
⏳ Waiting before next search...

🔍 Scraping jobs for role 3/3: 'Machine Learning Intern'
✅ Found 30 jobs for 'Machine Learning Intern'

🎉 Total real jobs scraped: 69
🔒 Browser closed
✅ Saved 69 jobs to jobs_KODIPAKA_SAIDIVYA_20251008_151622.csv
💾 Jobs saved to: jobs_KODIPAKA_SAIDIVYA_20251008_151622.csv

🎉 Pipeline completed succe

In [None]:
# Simple function to scrape jobs for specific roles
def scrape_jobs_directly(job_roles: list, location: str = "India", pages: int = 1, 
                        experience_level: str = None, days_back: int = 7) -> list:
    """
    Directly scrape jobs for given roles without resume analysis
    
    Parameters:
    -----------
    job_roles : list
        List of job titles to search for
    location : str
        Location to search in (default: "India")
    pages : int
        Number of pages to scrape per role
    experience_level : str
        Experience level filter
    days_back : int
        Days to look back for job postings
    
    Returns:
    --------
    list
        List of scraped job dictionaries
    """
    print(f"🔍 Direct Job Scraping for {len(job_roles)} roles")
    print("="*60)
    print(f"📍 Location: {location}")
    print(f"📊 Experience Level: {experience_level or 'Any'}")
    print(f"⏰ Days Back: {days_back}")
    print(f"📄 Pages per role: {pages}")
    print("="*60)
    
    # Initialize scraper
    scraper = LinkedInJobScraper(headless=False)
    all_jobs = []
    
    try:
        for i, role in enumerate(job_roles, 1):
            print(f"\n🔍 [{i}/{len(job_roles)}] Scraping: '{role}'")
            
            jobs = scraper.scrape_jobs(
                job_title=role,
                location=location,
                pages=pages,
                experience_level=experience_level,
                days_back=days_back
            )
            
            if jobs:
                all_jobs.extend(jobs)
                print(f"✅ Found {len(jobs)} jobs for '{role}'")
                
                # Show first few job titles as preview
                for j, job in enumerate(jobs[:3], 1):
                    print(f"   {j}. {job['title']} at {job['company']}")
                if len(jobs) > 3:
                    print(f"   ... and {len(jobs) - 3} more")
            else:
                print(f"⚠️ No jobs found for '{role}'")
            
            # Add delay between searches
            if i < len(job_roles):
                delay = random.choice(list(range(5, 10)))
                print(f"⏳ Waiting {delay} seconds before next search...")
                time.sleep(delay)
        
        print(f"\n🎉 Scraping completed!")
        print(f"📊 Total jobs found: {len(all_jobs)}")
        
    except Exception as e:
        print(f"❌ Error during scraping: {str(e)}")
        logging.error(f"Direct scraping error: {str(e)}")
    
    finally:
        scraper.close()
        print("🔒 Browser closed")
    
    return all_jobs

# Example usage (uncomment to test):
"""
# Test with specific roles
test_roles = ["Data Analyst", "Software Engineer", "Python Developer"]
jobs = scrape_jobs_directly(
    job_roles=test_roles,
    location="India", 
    pages=1,
    experience_level="Junior",
    days_back=7
)

# Save results
if jobs:
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"direct_jobs_{timestamp}.csv"
    save_jobs_to_csv(jobs, filename)
    print(f"💾 Saved {len(jobs)} jobs to {filename}")
"""
