# GenAI Job Finder - Resume to Job Matching System

This notebook implements a comprehensive job matching system that:

1. Loads job data from SQLite database
2. Processes resume files (PDF/DOC)
3. Uses LangChain + Ollama Llama3.2 for semantic matching
4. Incorporates user preferences
5. Returns ranked job recommendations with explanations


## 1. Setup and Imports


In [1]:
# Install required packages (run once)
# !pip install langchain langchain-community langchain-ollama
# !pip install pypdf2 python-docx pandas sqlite3 numpy scikit-learn
# !pip install sentence-transformers chromadb

In [2]:
# Import required libraries
import sqlite3
import pandas as pd
from pathlib import Path
import sys
from datetime import datetime
import json
import re
import numpy as np
from typing import List, Dict, Any, Optional

# Document processing
import PyPDF2
from docx import Document

# ML and similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# LangChain imports
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain.schema import Document as LangChainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

print("✅ All imports successful")

  from .autonotebook import tqdm as notebook_tqdm


✅ All imports successful


## 2. Database Loading (Your Provided Code)


In [3]:
# Add project root to path
project_root = (
    Path(__file__).parent.parent if "__file__" in globals() else Path.cwd().parent
)
sys.path.append(str(project_root))

try:
    from genai_job_finder.linkedin_parser.database import DatabaseManager
    from genai_job_finder.linkedin_parser.models import Job, JobRun

    print("✅ Successfully imported custom modules")
except ImportError as e:
    print(f"⚠️  Could not import custom modules: {e}")
    print("Will use direct SQLite queries instead")

# Initialize database connection
db_path = project_root / "data" / "jobs.db"
# Alternative path for testing
# db_path = project_root / "test_jobs.db"

print(f"Database path: {db_path}")
print(f"Database exists: {db_path.exists()}")

if not db_path.exists():
    print("❌ Database not found. Please check the path.")
else:
    print("✅ Database found")

✅ Successfully imported custom modules
Database path: /home/alireza/projects/genai_job_finder/data/jobs.db
Database exists: True
✅ Database found


In [4]:
# Load job data from database
def load_jobs_data(db_path: Path) -> pd.DataFrame:
    """Load jobs data from SQLite database"""
    with sqlite3.connect(db_path) as conn:
        # Get the latest job_run created_at timestamp
        latest_run_query = "SELECT MAX(created_at) as latest_run FROM job_runs WHERE status = 'completed'"
        latest_run = pd.read_sql_query(latest_run_query, conn).iloc[0]["latest_run"]

        query = f"""
        SELECT 
            id,
            company,
            company_size,
            company_followers,
            company_industry,
            title,
            location,
            work_location_type,
            level,
            salary_range,
            employment_type,
            job_function,
            industries,
            posted_time,
            applicants,
            job_id,
            date,
            parsing_link,
            job_posting_link,
            company_info_link,
            created_at
        FROM jobs 
        WHERE created_at > '{latest_run}'
        ORDER BY created_at DESC    
        """

        return pd.read_sql_query(query, conn)


# Load the data
if db_path.exists():
    jobs_df = load_jobs_data(db_path)
    # print(f"\n📋 Sample of loaded data:")
    # print(jobs_df[["company", "title", "location", "level", "job_function"]].head())
else:
    print("❌ Cannot load data - database not found")

## 3. Resume Processing Functions


In [5]:
class ResumeProcessor:
    """Process resume files and extract relevant information"""

    def __init__(self, llm):
        self.llm = llm
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF file"""
        try:
            with open(file_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
                return text.strip()
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX file"""
        try:
            doc = Document(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text.strip()
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def load_resume(self, file_path: str) -> str:
        """Load resume from file (PDF or DOCX)"""
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"Resume file not found: {file_path}")

        if file_path.suffix.lower() == ".pdf":
            return self.extract_text_from_pdf(str(file_path))
        elif file_path.suffix.lower() in [".docx", ".doc"]:
            return self.extract_text_from_docx(str(file_path))
        else:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")

    def extract_resume_features(self, resume_text: str) -> Dict[str, Any]:
        """Extract structured features from resume using LLM"""

        extraction_prompt = PromptTemplate(
            input_variables=["resume_text"],
            template="""
            Analyze the following resume and extract key information in JSON format.
            
            Resume Text:
            {resume_text}
            
            Please extract and return ONLY a valid JSON object with the following structure:
            {{
                "skills": ["list of technical and soft skills"],
                "experience_years": "estimated years of experience as integer",
                "job_titles": ["previous job titles"],
                "industries": ["industries worked in"],
                "education": ["degrees and certifications"],
                "key_achievements": ["notable achievements"],
                "preferred_roles": ["types of roles this person seems suited for"]
            }}
            
            Return only the JSON object, no additional text.
            """,
        )

        try:
            chain = extraction_prompt | self.llm
            response = chain.invoke(
                {"resume_text": resume_text[:4000]}
            )  # Limit text length

            # Clean and parse JSON response
            json_str = response.strip()
            if json_str.startswith("```json"):
                json_str = json_str[7:-3]
            elif json_str.startswith("```"):
                json_str = json_str[3:-3]

            features = json.loads(json_str)
            return features

        except Exception as e:
            print(f"Error extracting features: {e}")
            # Return basic fallback structure
            return {
                "skills": [],
                "experience_years": 0,
                "job_titles": [],
                "industries": [],
                "education": [],
                "key_achievements": [],
                "preferred_roles": [],
            }


# print("✅ Resume processor class defined")

In [6]:
class JobMatcher:
    """Main job matching engine using multiple similarity metrics"""

    def __init__(self, llm, embedding_model_name="all-MiniLM-L6-v2"):
        self.llm = llm
        self.use_sentence_transformer = False
        self.sentence_model = None

        # Check if sentence transformers are available from the setup
        if globals().get("SENTENCE_TRANSFORMER_AVAILABLE", False):
            try:
                import torch

                device = "cuda" if torch.cuda.is_available() else "cpu"

                # Try to initialize sentence transformer with fallback models
                model_names = [
                    embedding_model_name,
                    "paraphrase-MiniLM-L3-v2",
                    "all-MiniLM-L6-v2",
                ]

                for model_name in model_names:
                    try:
                        self.sentence_model = SentenceTransformer(
                            model_name, device=device, cache_folder="./models"
                        )
                        self.use_sentence_transformer = True
                        print(f"✅ Using sentence transformer: {model_name}")
                        break
                    except Exception as e:
                        print(f"⚠️ Failed to load {model_name}: {e}")
                        continue

            except Exception as e:
                print(f"⚠️ Sentence transformer initialization failed: {e}")
                self.use_sentence_transformer = False

        if not self.use_sentence_transformer:
            print("📝 Falling back to enhanced TF-IDF similarity")

        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=1000, stop_words="english", ngram_range=(1, 2)
        )

    def prepare_job_texts(self, jobs_df: pd.DataFrame) -> List[str]:
        """Prepare job descriptions for matching"""
        job_texts = []

        for _, job in jobs_df.iterrows():
            # Combine relevant job information
            text_parts = [
                str(job.get("title", "")),
                str(job.get("company", "")),
                str(job.get("job_function", "")),
                str(job.get("industries", "")),
                str(job.get("level", "")),
                str(job.get("location", "")),
            ]

            job_text = " ".join([part for part in text_parts if part and part != "nan"])
            job_texts.append(job_text)

        return job_texts

    def calculate_semantic_similarity(
        self, resume_text: str, job_texts: List[str]
    ) -> np.ndarray:
        """Calculate semantic similarity using sentence transformers or fallback to enhanced TF-IDF"""
        if self.use_sentence_transformer and self.sentence_model:
            try:
                # Create embeddings
                resume_embedding = self.sentence_model.encode([resume_text])
                job_embeddings = self.sentence_model.encode(job_texts)

                # Calculate cosine similarity
                similarities = cosine_similarity(resume_embedding, job_embeddings)[0]
                return similarities
            except Exception as e:
                print(f"⚠️ Sentence transformer failed, using TF-IDF: {e}")
                self.use_sentence_transformer = False

        # Fallback to enhanced TF-IDF
        return self.calculate_enhanced_tfidf_similarity(resume_text, job_texts)

    def calculate_enhanced_tfidf_similarity(
        self, resume_text: str, job_texts: List[str]
    ) -> np.ndarray:
        """Enhanced TF-IDF similarity with preprocessing"""

        # Preprocess texts
        def preprocess_text(text):
            # Convert to lowercase and remove extra spaces
            text = re.sub(r"\s+", " ", text.lower().strip())
            return text

        resume_text = preprocess_text(resume_text)
        job_texts = [preprocess_text(text) for text in job_texts]

        # Use enhanced TF-IDF
        enhanced_vectorizer = TfidfVectorizer(
            max_features=2000,
            stop_words="english",
            ngram_range=(1, 3),  # Include trigrams for better matching
            min_df=1,
            max_df=0.95,
        )

        all_texts = [resume_text] + job_texts
        tfidf_matrix = enhanced_vectorizer.fit_transform(all_texts)

        # Calculate similarity between resume and jobs
        resume_vector = tfidf_matrix[0:1]
        job_vectors = tfidf_matrix[1:]

        similarities = cosine_similarity(resume_vector, job_vectors)[0]
        return similarities

    def calculate_keyword_similarity(
        self, resume_text: str, job_texts: List[str]
    ) -> np.ndarray:
        """Calculate TF-IDF based keyword similarity"""
        # Fit TF-IDF on all texts
        all_texts = [resume_text] + job_texts
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(all_texts)

        # Calculate similarity between resume and jobs
        resume_vector = tfidf_matrix[0:1]
        job_vectors = tfidf_matrix[1:]

        similarities = cosine_similarity(resume_vector, job_vectors)[0]
        return similarities

    def calculate_preference_score(
        self, job_row: pd.Series, user_preferences: Dict[str, Any]
    ) -> float:
        """Calculate how well job matches user preferences"""
        score = 0.0
        max_score = 0.0

        # Location preference
        if "preferred_locations" in user_preferences:
            max_score += 1.0
            job_location = str(job_row.get("location", "")).lower()
            work_location = str(job_row.get("work_location", "")).lower()

            for pref_loc in user_preferences["preferred_locations"]:
                if (
                    pref_loc.lower() in job_location
                    or pref_loc.lower() in work_location
                ):
                    score += 1.0
                    break

        # Salary preference
        if "salary_range" in user_preferences:
            max_score += 1.0
            salary_range = str(job_row.get("salary_range", ""))
            if salary_range and salary_range != "nan":
                # Simple salary matching logic
                score += 0.5  # Partial credit for having salary info

        # Job function preference
        if "job_functions" in user_preferences:
            max_score += 1.0
            job_function = str(job_row.get("job_function", "")).lower()

            for pref_func in user_preferences["job_functions"]:
                if pref_func.lower() in job_function:
                    score += 1.0
                    break

        # Employment type preference
        if "employment_type" in user_preferences:
            max_score += 1.0
            emp_type = str(job_row.get("employment_type", "")).lower()

            if user_preferences["employment_type"].lower() in emp_type:
                score += 1.0

        return score / max_score if max_score > 0 else 0.0

    def match_jobs(
        self,
        resume_features: Dict[str, Any],
        resume_text: str,
        jobs_df: pd.DataFrame,
        user_preferences: Dict[str, Any],
        top_k: int = 10,
    ) -> pd.DataFrame:
        """Main matching function that combines all similarity metrics"""

        print("🔍 Starting job matching process...")

        # Prepare job texts
        job_texts = self.prepare_job_texts(jobs_df)
        print(f"📝 Prepared {len(job_texts)} job descriptions")

        # Calculate different similarity scores
        if self.use_sentence_transformer:
            print("🧠 Calculating semantic similarities with sentence transformer...")
        else:
            print("🧠 Calculating semantic similarities with enhanced TF-IDF...")
        semantic_scores = self.calculate_semantic_similarity(resume_text, job_texts)

        print("🔤 Calculating keyword similarities...")
        keyword_scores = self.calculate_keyword_similarity(resume_text, job_texts)

        print("⚙️ Calculating preference scores...")
        preference_scores = []
        for _, job_row in jobs_df.iterrows():
            pref_score = self.calculate_preference_score(job_row, user_preferences)
            preference_scores.append(pref_score)
        preference_scores = np.array(preference_scores)

        # Adjust weights based on available methods
        if self.use_sentence_transformer:
            weights = {"semantic": 0.4, "keyword": 0.3, "preference": 0.3}
        else:
            # Give more weight to keyword matching when no sentence transformer
            weights = {
                "semantic": 0.5,  # Enhanced TF-IDF
                "keyword": 0.2,
                "preference": 0.3,
            }

        final_scores = (
            weights["semantic"] * semantic_scores
            + weights["keyword"] * keyword_scores
            + weights["preference"] * preference_scores
        )

        # Add scores to dataframe
        jobs_with_scores = jobs_df.copy()
        jobs_with_scores["semantic_score"] = semantic_scores
        jobs_with_scores["keyword_score"] = keyword_scores
        jobs_with_scores["preference_score"] = preference_scores
        jobs_with_scores["final_score"] = final_scores

        # Sort by final score and return top matches
        top_matches = jobs_with_scores.nlargest(top_k, "final_score")

        method = (
            "sentence transformer"
            if self.use_sentence_transformer
            else "enhanced TF-IDF"
        )
        print(f"✅ Found top {len(top_matches)} job matches using {method}")
        return top_matches


# print("✅ Job matcher class defined")

## 5. Configuration and User Inputs


In [7]:
# Initialize Ollama LLM
print("🚀 Initializing Ollama LLM...")
try:
    llm = OllamaLLM(model="llama3.2", temperature=0.1)
    # Test the connection
    test_response = llm.invoke("Hello, respond with 'OK' if you can hear me.")
    print(f"✅ LLM initialized successfully. Test response: {test_response[:50]}...")
except Exception as e:
    print(f"❌ Error initializing LLM: {e}")
    print("Make sure Ollama is running and llama3.2 model is installed")
    print("Run: ollama pull llama3.2")

🚀 Initializing Ollama LLM...


2025-09-07 22:01:24,133 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


✅ LLM initialized successfully. Test response: OK...


In [8]:
# 📁 RESUME FILE PATH - UPDATE THIS PATH
RESUME_FILE_PATH = "/home/alireza/projects/genai_job_finder/genai_job_finder/data/Ali Zarreh_CV_2025_08_30.docx"  # Change this to your resume file

# 👤 USER PREFERENCES - CUSTOMIZE THESE
USER_PREFERENCES = {
    "preferred_locations": ["San Antonio, TX", "Remote"],
    "salary_range": {"min": 150000, "max": 350000},
    "job_functions": [],  # ["Information Technology", "Engineering", "Data Science"],
    "employment_type": "Full-time",
    "level_preference": ["Mid-Senior level", "Senior level"],
    "must_have_keywords": [],  # ["Python", "Data", "Analytics"],
    "exclude_keywords": [],  # ["Sales", "Marketing"],
}

# 🎯 MATCHING PARAMETERS
TOP_K_MATCHES = 15  # Number of top matches to return

print("⚙️ Configuration set:")
print(f"   Resume file: {RESUME_FILE_PATH}")
print(f"   Preferred locations: {USER_PREFERENCES['preferred_locations']}")
print(f"   Job functions: {USER_PREFERENCES['job_functions']}")
print(f"   Top matches to return: {TOP_K_MATCHES}")

⚙️ Configuration set:
   Resume file: /home/alireza/projects/genai_job_finder/genai_job_finder/data/Ali Zarreh_CV_2025_08_30.docx
   Preferred locations: ['San Antonio, TX', 'Remote']
   Job functions: []
   Top matches to return: 15


## 6. Main Execution Pipeline


In [9]:
# Initialize processors
print("🔧 Initializing processors...")
resume_processor = ResumeProcessor(llm)
job_matcher = JobMatcher(llm)
print("✅ Processors initialized")

# Step 1: Load and process resume
print("\n📄 STEP 1: Processing Resume")
print("=" * 40)

try:
    # For demo purposes, create a sample resume text if file doesn't exist
    if not Path(RESUME_FILE_PATH).exists():
        print(f"⚠️  Resume file not found at {RESUME_FILE_PATH}")
        print("Using sample resume for demonstration...")

        resume_text = """
        John Doe
        Senior Data Scientist
        
        EXPERIENCE:
        - 5+ years in data science and machine learning
        - Expert in Python, SQL, and statistical analysis
        - Experience with cloud platforms (AWS, Azure)
        - Led data analytics projects for Fortune 500 companies
        
        SKILLS:
        - Programming: Python, R, SQL, JavaScript
        - Machine Learning: scikit-learn, TensorFlow, PyTorch
        - Data Visualization: Tableau, Power BI, matplotlib
        - Cloud: AWS, Azure, Google Cloud Platform
        - Databases: PostgreSQL, MongoDB, Redis
        
        EDUCATION:
        - M.S. in Data Science, University of Texas
        - B.S. in Computer Science, Texas A&M University
        
        CERTIFICATIONS:
        - AWS Certified Solutions Architect
        - Google Cloud Professional Data Engineer
        """
    else:
        resume_text = resume_processor.load_resume(RESUME_FILE_PATH)
        print(f"✅ Resume loaded successfully ({len(resume_text)} characters)")

    # Extract resume features
    print("🧠 Extracting resume features using LLM...")
    resume_features = resume_processor.extract_resume_features(resume_text)

    print("✅ Resume features extracted:")
    for key, value in resume_features.items():
        if isinstance(value, list):
            print(
                f"   {key}: {len(value)} items - {value[:3]}{'...' if len(value) > 3 else ''}"
            )
        else:
            print(f"   {key}: {value}")

except Exception as e:
    print(f"❌ Error processing resume: {e}")
    resume_text = ""
    resume_features = {}

🔧 Initializing processors...
📝 Falling back to enhanced TF-IDF similarity
✅ Processors initialized

📄 STEP 1: Processing Resume
✅ Resume loaded successfully (9802 characters)
🧠 Extracting resume features using LLM...


2025-09-07 22:01:24,502 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


✅ Resume features extracted:
   skills: 4 items - ['Advanced expertise in Python, R, SQL, Scala', 'Proficient in employing deep learning frameworks like TensorFlow, Keras, PyTorch', 'Skilled in machine learning algorithms via Scikit-learn, XGBoost, Spark MLlib']...
   experience_years: 3
   job_titles: 1 items - ['Senior Data Scientist']
   industries: 2 items - ['Retail', 'Manufacturing']
   education: 3 items - ['Ph.D. Mechanical and Manufacturing Systems Engineering (Cybersecurity)', 'Master of Science in Mechanical Engineering', 'Bachelor of Science in Mechanical Engineering']
   key_achievements: 2 items - ['Automated credit-card expense classification', 'Built an agentic RAG documentation assistant']
   preferred_roles: 3 items - ['Data Scientist', 'Lead Data Scientist', 'Director of Data Science']


In [10]:
# Step 2: Match jobs
print("\n🎯 STEP 2: Matching Jobs")
print("=" * 40)

if "jobs_df" in locals() and len(jobs_df) > 0 and resume_text:
    try:
        # Perform job matching
        top_matches = job_matcher.match_jobs(
            resume_features=resume_features,
            resume_text=resume_text,
            jobs_df=jobs_df,
            user_preferences=USER_PREFERENCES,
            top_k=TOP_K_MATCHES,
        )

        print(f"\n🏆 TOP {len(top_matches)} JOB MATCHES:")
        print("=" * 50)

        # Display results
        for idx, (_, job) in enumerate(top_matches.iterrows(), 1):
            print(f"\n{idx}. {job['title']} at {job['company']}")
            print(
                f"   📍 Location: {job['location']} ({job.get('work_location', 'N/A')})"
            )
            print(f"   💼 Level: {job.get('level', 'N/A')}")
            print(f"   🏢 Function: {job.get('job_function', 'N/A')}")
            print(f"   💰 Salary: {job.get('salary_range', 'N/A')}")
            print(
                f"   📊 Scores: Final={job['final_score']:.3f} | Semantic={job['semantic_score']:.3f} | Keyword={job['keyword_score']:.3f} | Preference={job['preference_score']:.3f}"
            )

            if idx >= 10:  # Limit display to top 10
                print(f"   ... and {len(top_matches) - 10} more matches")
                break

    except Exception as e:
        print(f"❌ Error during job matching: {e}")
        import traceback

        traceback.print_exc()
else:
    print("❌ Cannot perform matching - missing jobs data or resume text")


🎯 STEP 2: Matching Jobs
🔍 Starting job matching process...
📝 Prepared 20 job descriptions
🧠 Calculating semantic similarities with enhanced TF-IDF...
🔤 Calculating keyword similarities...
⚙️ Calculating preference scores...
✅ Found top 15 job matches using enhanced TF-IDF

🏆 TOP 15 JOB MATCHES:

1. Environmental Data Engineer / Machine Learning Engineer at Grassroots Carbon
   📍 Location: San Antonio, TX (N/A)
   💼 Level: Mid-Senior level
   🏢 Function: Other
   💰 Salary: None
   📊 Scores: Final=0.239 | Semantic=0.065 | Keyword=0.093 | Preference=0.625

2. Principal Machine Learning Engineer, Recommendation Systems at Launch Potato
   📍 Location: San Antonio, TX (N/A)
   💼 Level: Director
   🏢 Function: Engineering and Information Technology
   💰 Salary: $160,000.00/yr - $250,000.00/yr
   📊 Scores: Final=0.238 | Semantic=0.064 | Keyword=0.095 | Preference=0.625

3. Sr Data Scientist - eCommerce at H-E-B
   📍 Location: San Antonio, TX (N/A)
   💼 Level: Mid-Senior level
   🏢 Function: E

In [11]:
# Generate detailed explanations for top matches
def generate_match_explanation(
    job_row: pd.Series, resume_features: Dict[str, Any]
) -> str:
    """Generate detailed explanation for why a job matches"""

    explanation_prompt = PromptTemplate(
        input_variables=["job_info", "resume_features"],
        template="""
        Analyze why this job is a good match for the candidate and provide a detailed explanation.
        
        Job Information:
        - Title: {job_info[title]}
        - Company: {job_info[company]}
        - Location: {job_info[location]}
        - Level: {job_info[level]}
        - Function: {job_info[job_function]}
        - Industry: {job_info[industries]}
        
        Candidate Profile:
        - Skills: {resume_features[skills]}
        - Experience: {resume_features[experience_years]} years
        - Previous Roles: {resume_features[job_titles]}
        - Industries: {resume_features[industries]}
        
        Provide a concise explanation (2-3 sentences) covering:
        1. Key skill alignments
        2. Experience level match
        3. Industry/function relevance
        4. Any potential concerns or gaps
        """,
    )

    try:
        job_info = {
            "title": job_row.get("title", "N/A"),
            "company": job_row.get("company", "N/A"),
            "location": job_row.get("location", "N/A"),
            "level": job_row.get("level", "N/A"),
            "job_function": job_row.get("job_function", "N/A"),
            "industries": job_row.get("industries", "N/A"),
        }

        chain = explanation_prompt | llm
        explanation = chain.invoke(
            {"job_info": job_info, "resume_features": resume_features}
        )

        return explanation.strip()

    except Exception as e:
        return f"Unable to generate explanation: {e}"


# Generate explanations for top 5 matches
if "top_matches" in locals() and len(top_matches) > 0:
    print("\n🔍 DETAILED MATCH ANALYSIS")
    print("=" * 50)

    for idx, (_, job) in enumerate(top_matches.head(5).iterrows(), 1):
        print(f"\n{idx}. {job['title']} at {job['company']}")
        print(f"   Score: {job['final_score']:.3f}")
        print("   Analysis:")

        explanation = generate_match_explanation(job, resume_features)
        # Format explanation with proper indentation
        for line in explanation.split("\n"):
            if line.strip():
                print(f"   {line.strip()}")
        print()


🔍 DETAILED MATCH ANALYSIS

1. Environmental Data Engineer / Machine Learning Engineer at Grassroots Carbon
   Score: 0.239
   Analysis:
   Unable to generate explanation: "Input to PromptTemplate is missing variables {'job_info[job_function]', 'resume_features[industries]', 'resume_features[job_titles]', 'job_info[industries]', 'job_info[location]', 'job_info[title]', 'job_info[company]', 'resume_features[skills]', 'job_info[level]', 'resume_features[experience_years]'}.  Expected: ['job_info[company]', 'job_info[industries]', 'job_info[job_function]', 'job_info[level]', 'job_info[location]', 'job_info[title]', 'resume_features[experience_years]', 'resume_features[industries]', 'resume_features[job_titles]', 'resume_features[skills]'] Received: ['job_info', 'resume_features']\nNote: if you intended {job_info[job_function]} to be part of the string and not a variable, please escape it with double curly braces like: '{{job_info[job_function]}}'.\nFor troubleshooting, visit: https://pyth

In [12]:
# Export results to CSV
if "top_matches" in locals() and len(top_matches) > 0:
    # Prepare export data - check which columns actually exist
    available_columns = top_matches.columns.tolist()

    # Define desired export columns with correct names
    desired_export_columns = [
        "company",
        "title",
        "location",
        "work_location_type",  # Fixed: was 'work_location'
        "level",
        "salary_range",
        "employment_type",
        "job_function",
        "industries",
        "final_score",
        "semantic_score",
        "keyword_score",
        "preference_score",
        "job_posting_link",
        "posted_time",
    ]

    # Only include columns that actually exist in the dataframe
    export_columns = [col for col in desired_export_columns if col in available_columns]

    # Print info about missing columns
    missing_columns = [
        col for col in desired_export_columns if col not in available_columns
    ]
    if missing_columns:
        print(f"⚠️ Missing columns (will be skipped): {missing_columns}")

    export_df = top_matches[export_columns].copy()

    # Round scores for better readability
    score_columns = [
        "final_score",
        "semantic_score",
        "keyword_score",
        "preference_score",
    ]
    for col in score_columns:
        if col in export_df.columns:
            export_df[col] = export_df[col].round(4)

    # Export to CSV
    output_file = f"job_matches_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    export_df.to_csv(output_file, index=False)

    print(f"📊 Results exported to: {output_file}")
    print(f"📈 Summary Statistics:")
    print(f"   Total matches analyzed: {len(jobs_df)}")
    print(f"   Top matches returned: {len(top_matches)}")
    print(f"   Average final score: {top_matches['final_score'].mean():.3f}")
    print(
        f"   Score range: {top_matches['final_score'].min():.3f} - {top_matches['final_score'].max():.3f}"
    )

    # Show distribution by job function
    print(f"\n📋 Top matches by job function:")
    function_counts = top_matches["job_function"].value_counts().head(5)
    for func, count in function_counts.items():
        print(f"   {func}: {count} matches")
else:
    print("❌ No results to export")

📊 Results exported to: job_matches_20250907_220126.csv
📈 Summary Statistics:
   Total matches analyzed: 20
   Top matches returned: 15
   Average final score: 0.213
   Score range: 0.148 - 0.239

📋 Top matches by job function:
   Engineering and Information Technology: 8 matches
   Other: 3 matches
   Information Technology: 3 matches
   Engineering: 1 matches


In [13]:
# Function to refine search based on feedback
def refine_search(
    original_matches: pd.DataFrame, feedback: Dict[str, Any]
) -> pd.DataFrame:
    """Refine job matches based on user feedback"""

    refined_matches = original_matches.copy()

    # Apply additional filters based on feedback
    if "exclude_companies" in feedback:
        for company in feedback["exclude_companies"]:
            refined_matches = refined_matches[
                ~refined_matches["company"].str.contains(company, case=False, na=False)
            ]

    if "focus_on_functions" in feedback:
        function_filter = "|".join(feedback["focus_on_functions"])
        refined_matches = refined_matches[
            refined_matches["job_function"].str.contains(
                function_filter, case=False, na=False
            )
        ]

    if "min_score" in feedback:
        refined_matches = refined_matches[
            refined_matches["final_score"] >= feedback["min_score"]
        ]

    return refined_matches.head(10)


# Example refinement (uncomment and modify as needed)
# REFINEMENT_FEEDBACK = {
#     'exclude_companies': ['Company A', 'Company B'],
#     'focus_on_functions': ['Information Technology', 'Engineering'],
#     'min_score': 0.3
# }

# if 'top_matches' in locals():
#     refined_matches = refine_search(top_matches, REFINEMENT_FEEDBACK)
#     print(f"🔄 Refined search returned {len(refined_matches)} matches")

print("✅ Refinement function ready (uncomment to use)")

✅ Refinement function ready (uncomment to use)


In [14]:
print("\n🎉 JOB MATCHING COMPLETE!")
print("=" * 50)

if "top_matches" in locals() and len(top_matches) > 0:
    print(f"✅ Successfully matched {len(top_matches)} jobs to your resume")
    print(f"📊 Results exported to CSV file")
    print(
        f"🎯 Top match: {top_matches.iloc[0]['title']} at {top_matches.iloc[0]['company']}"
    )
    print(f"📈 Best score: {top_matches.iloc[0]['final_score']:.3f}")

    print(f"\n🚀 NEXT STEPS:")
    print(f"1. Review the detailed match analysis above")
    print(f"2. Check the exported CSV file for complete results")
    print(f"3. Visit job posting links for positions of interest")
    print(f"4. Customize USER_PREFERENCES and re-run for different results")
    print(f"5. Use the refinement section to filter results further")
else:
    print("❌ No matches found. Consider:")
    print("   - Checking your resume file path")
    print("   - Verifying database connection")
    print("   - Adjusting user preferences")
    print("   - Ensuring Ollama is running with llama3.2 model")

print(
    f"\n💡 TIP: Modify the USER_PREFERENCES section and re-run cells 6-8 to get different results!"
)
print(f"🔧 For technical issues, check that Ollama is running: ollama serve")


🎉 JOB MATCHING COMPLETE!
✅ Successfully matched 15 jobs to your resume
📊 Results exported to CSV file
🎯 Top match: Environmental Data Engineer / Machine Learning Engineer at Grassroots Carbon
📈 Best score: 0.239

🚀 NEXT STEPS:
1. Review the detailed match analysis above
2. Check the exported CSV file for complete results
3. Visit job posting links for positions of interest
4. Customize USER_PREFERENCES and re-run for different results
5. Use the refinement section to filter results further

💡 TIP: Modify the USER_PREFERENCES section and re-run cells 6-8 to get different results!
🔧 For technical issues, check that Ollama is running: ollama serve


In [15]:
# Hugging Face Setup and Model Initialization - MUST RUN BEFORE OTHER CELLS
import os
from pathlib import Path

# Load environment variables from .env file if it exists
env_file = Path("../.env")
if env_file.exists():
    from dotenv import load_dotenv

    load_dotenv(env_file)
    print("✅ Loaded environment variables from .env file")
else:
    print("⚠️ No .env file found, using default settings")

# Check if we should skip sentence transformers entirely
skip_sentence_transformers = (
    os.getenv("SKIP_SENTENCE_TRANSFORMERS", "false").lower() == "true"
)

if skip_sentence_transformers:
    print("🚫 SKIP_SENTENCE_TRANSFORMERS=true - Skipping sentence transformer setup")
    sentence_transformer_available = False
else:
    # Check if we have a Hugging Face token
    hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
    if hf_token and hf_token.startswith("hf_"):
        print("✅ Hugging Face token found, configuring for authenticated access")
        # Set the token for huggingface-hub
        os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token  # Alternative env var name

        # Enable online mode with authentication
        os.environ["HF_HUB_OFFLINE"] = "0"
        os.environ["TRANSFORMERS_OFFLINE"] = "0"
        os.environ["HF_DATASETS_OFFLINE"] = "0"
    else:
        print("⚠️ No Hugging Face token found, forcing offline mode")
        os.environ["HF_HUB_OFFLINE"] = "1"
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        os.environ["HF_DATASETS_OFFLINE"] = "1"

    # Always disable telemetry
    os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

    # Set local cache directories
    models_dir = Path("./models")
    models_dir.mkdir(exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = str(models_dir / "transformers_cache")
    os.environ["HF_HOME"] = str(models_dir / "hf_cache")
    os.environ["SENTENCE_TRANSFORMERS_HOME"] = str(models_dir / "sentence_transformers")

    # Suppress transformer warnings
    try:
        from transformers import logging as transformers_logging

        transformers_logging.set_verbosity_error()
    except ImportError:
        pass

    print("🔧 Hugging Face environment configured")

    # Test sentence transformer availability
    sentence_transformer_available = False

    try:
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🖥️ PyTorch device: {device}")

        # Configure huggingface_hub with token
        if hf_token:
            try:
                from huggingface_hub import login

                login(token=hf_token, add_to_git_credential=False)
                print("✅ Logged in to Hugging Face Hub")
            except Exception as e:
                print(f"⚠️ Could not login to HF Hub: {e}")

        from sentence_transformers import SentenceTransformer

        print("🔄 Loading sentence transformer model...")

        # Try loading with explicit cache directory and simpler model name
        try:
            # Use just the model name without organization prefix
            model = SentenceTransformer(
                "all-MiniLM-L6-v2",
                device=device,
                cache_folder=str(models_dir / "sentence_transformers"),
            )

            # Test the model
            test_embedding = model.encode("test sentence", show_progress_bar=False)
            if test_embedding is not None and len(test_embedding) > 0:
                sentence_transformer_available = True
                print(
                    f"✅ Sentence transformer loaded successfully! Embedding size: {len(test_embedding)}"
                )

            del model  # Free memory

        except Exception as e:
            error_msg = str(e)
            if "connection error" in error_msg.lower():
                print("🌐 Connection error - check firewall/proxy settings")
            elif "401" in error_msg or "unauthorized" in error_msg.lower():
                print("🔑 Authentication error - check HF token validity")
            else:
                print(f"⚠️ Failed to load model: {error_msg[:200]}...")
            sentence_transformer_available = False

    except ImportError as e:
        print(f"⚠️ Missing dependency: {e}")
        print("💡 Install with: pip install sentence-transformers")
        sentence_transformer_available = False
    except Exception as e:
        print(f"⚠️ Unexpected error: {e}")
        sentence_transformer_available = False

# Final status
if sentence_transformer_available:
    print("\n🎉 SUCCESS! Sentence transformers are ready to use!")
else:
    print("\n📊 Using TF-IDF similarity (works great for job matching!)")
    print("\n💡 To use sentence transformers, try ONE of these:")
    print("   1. Set SKIP_SENTENCE_TRANSFORMERS=true in .env (recommended)")
    print(
        "   2. Run in terminal: pip install --upgrade sentence-transformers transformers huggingface-hub"
    )
    print("   3. Check proxy/firewall settings if behind corporate network")
    print(
        "\n✨ TF-IDF similarity is highly effective for job matching, so don't worry!"
    )

# Store the availability status for other cells
globals()["SENTENCE_TRANSFORMER_AVAILABLE"] = sentence_transformer_available

✅ Loaded environment variables from .env file
✅ Hugging Face token found, configuring for authenticated access
🔧 Hugging Face environment configured
🖥️ PyTorch device: cuda
🖥️ PyTorch device: cuda


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
2025-09-07 22:01:27,323 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-09-07 22:01:27,323 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


✅ Logged in to Hugging Face Hub
🔄 Loading sentence transformer model...
🌐 Connection error - check firewall/proxy settings

📊 Using TF-IDF similarity (works great for job matching!)

💡 To use sentence transformers, try ONE of these:
   1. Set SKIP_SENTENCE_TRANSFORMERS=true in .env (recommended)
   2. Run in terminal: pip install --upgrade sentence-transformers transformers huggingface-hub
   3. Check proxy/firewall settings if behind corporate network

✨ TF-IDF similarity is highly effective for job matching, so don't worry!
🌐 Connection error - check firewall/proxy settings

📊 Using TF-IDF similarity (works great for job matching!)

💡 To use sentence transformers, try ONE of these:
   1. Set SKIP_SENTENCE_TRANSFORMERS=true in .env (recommended)
   2. Run in terminal: pip install --upgrade sentence-transformers transformers huggingface-hub
   3. Check proxy/firewall settings if behind corporate network

✨ TF-IDF similarity is highly effective for job matching, so don't worry!


In [16]:
# Install python-dotenv if not available (run once)
try:
    import dotenv

    print("✅ python-dotenv already installed")
except ImportError:
    print("📦 Installing python-dotenv...")
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-dotenv"])
    print("✅ python-dotenv installed successfully")

✅ python-dotenv already installed


In [17]:
# Manual Model Download (run this if automatic loading fails)
import subprocess
import sys
from pathlib import Path


def download_sentence_transformer_model(model_name: str, local_dir: Path):
    """Manually download a sentence transformer model"""
    try:
        print(f"📥 Downloading {model_name} to {local_dir}...")

        # Create directory
        local_dir.mkdir(parents=True, exist_ok=True)

        # Download using huggingface-cli
        cmd = [
            sys.executable,
            "-m",
            "huggingface_hub.commands.huggingface_cli",
            "download",
            model_name,
            "--local-dir",
            str(local_dir),
            "--local-dir-use-symlinks",
            "False",
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode == 0:
            print(f"✅ Successfully downloaded {model_name}")
            return True
        else:
            print(f"❌ Failed to download {model_name}: {result.stderr}")
            return False

    except Exception as e:
        print(f"❌ Error downloading {model_name}: {e}")
        return False


# Uncomment and run this section if you want to manually download models
# models_to_download = [
#     ("sentence-transformers/all-MiniLM-L6-v2", "./models/all-MiniLM-L6-v2"),
#     ("sentence-transformers/paraphrase-MiniLM-L3-v2", "./models/paraphrase-MiniLM-L3-v2")
# ]

# for model_name, local_path in models_to_download:
#     download_sentence_transformer_model(model_name, Path(local_path))

print("💡 Uncomment the code above if you want to manually download models")

💡 Uncomment the code above if you want to manually download models


In [20]:
# Check Internet Connection and Hugging Face Access
import socket
import urllib.request
import urllib.error
import ssl


def check_internet_connection():
    """Check internet connectivity and Hugging Face access"""

    # Test general internet connectivity
    test_sites = [
        ("Google DNS", "8.8.8.8", 53),
        ("Cloudflare DNS", "1.1.1.1", 53),
    ]

    internet_available = False
    for name, host, port in test_sites:
        try:
            socket.create_connection((host, port), timeout=5)
            print(f"✅ {name} reachable - Internet connection available")
            internet_available = True
            break
        except:
            continue

    if not internet_available:
        print("❌ No internet connection detected")
        return False

    # Test HTTPS connectivity to specific sites
    test_urls = [
        ("Hugging Face Hub", "https://huggingface.co"),
        ("Hugging Face API", "https://api-inference.huggingface.co"),
        ("GitHub", "https://github.com"),
    ]

    # Create SSL context
    ssl_context = ssl.create_default_context()

    for name, url in test_urls:
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
            with urllib.request.urlopen(
                req, timeout=10, context=ssl_context
            ) as response:
                if response.status == 200:
                    print(f"✅ {name} accessible ({url})")
                else:
                    print(f"⚠️ {name} returned status {response.status}")
        except urllib.error.HTTPError as e:
            if e.code == 401:
                print(f"🔑 {name} requires authentication (401)")
            else:
                print(f"❌ {name} HTTP error: {e.code}")
        except urllib.error.URLError as e:
            print(f"❌ {name} connection error: {e.reason}")
        except Exception as e:
            print(f"❌ {name} error: {type(e).__name__}: {str(e)[:100]}")

    # Test with Hugging Face token if available
    import os

    hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
    if hf_token and hf_token.startswith("hf_"):
        print("\n🔑 Testing Hugging Face API with token...")
        try:
            req = urllib.request.Request(
                "https://huggingface.co/api/models/sentence-transformers/all-MiniLM-L6-v2",
                headers={
                    "Authorization": f"Bearer {hf_token}",
                    "User-Agent": "Mozilla/5.0",
                },
            )
            with urllib.request.urlopen(
                req, timeout=10, context=ssl_context
            ) as response:
                if response.status == 200:
                    print("✅ Hugging Face API accessible with token")
                    return True
        except urllib.error.HTTPError as e:
            print(f"❌ Hugging Face API error with token: {e.code} - {e.reason}")
        except Exception as e:
            print(f"❌ Failed to access Hugging Face API: {e}")

    return internet_available


# Run the connection check
print("🌐 Checking Internet Connectivity...")
print("=" * 50)
connection_ok = check_internet_connection()
print("=" * 50)

if not connection_ok:
    print("\n💡 Connection Issues Detected!")
    print("   1. Check your internet connection")
    print("   2. Check if you're behind a proxy/firewall")
    print("   3. Try: export SKIP_SENTENCE_TRANSFORMERS=true")
else:
    print("\n✅ Internet connection seems OK")
    print("   If sentence transformers still fail, try:")
    print("   1. Verify your HF token is valid")
    print("   2. Set SKIP_SENTENCE_TRANSFORMERS=true in .env")

🌐 Checking Internet Connectivity...
✅ Google DNS reachable - Internet connection available
✅ Hugging Face Hub accessible (https://huggingface.co)
✅ Hugging Face Hub accessible (https://huggingface.co)
🔑 Hugging Face API requires authentication (401)
🔑 Hugging Face API requires authentication (401)
✅ GitHub accessible (https://github.com)

🔑 Testing Hugging Face API with token...
✅ GitHub accessible (https://github.com)

🔑 Testing Hugging Face API with token...
✅ Hugging Face API accessible with token

✅ Internet connection seems OK
   If sentence transformers still fail, try:
   1. Verify your HF token is valid
   2. Set SKIP_SENTENCE_TRANSFORMERS=true in .env
✅ Hugging Face API accessible with token

✅ Internet connection seems OK
   If sentence transformers still fail, try:
   1. Verify your HF token is valid
   2. Set SKIP_SENTENCE_TRANSFORMERS=true in .env
