In [7]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# --- CONFIGURATION ---
# 1. Path to your Resume folders (PDFs)
PATH_RESUMES = "archive(1)/data/data"

# 2. Path to your Job Description folder (CSV files)
PATH_JOBS = "jobdesc/home/sdf"

OUTPUT_FILE = "processed_data.pkl"

In [8]:
MODEL_NAME = 'all-MiniLM-L6-v2'

def clean_text(text):
    if not text: return ""
    text = str(text).replace("\n", " ").replace("\r", " ").replace("\t", " ")
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    return re.sub(' +', ' ', text).strip()

def load_resumes(base_path):
    print(f"--- Scanning Resumes in {base_path} ---")
    data = []
    total_files = 0
    
    # Walk through all folders (Accountant, Engineering, etc.)
    for root, dirs, files in os.walk(base_path):
        category = os.path.basename(root)
        if root == base_path: continue
        
        pdf_files = [f for f in files if f.lower().endswith(".pdf")]
        
        for file in pdf_files:
            try:
                # Read PDF
                full_path = os.path.join(root, file)
                with fitz.open(full_path) as doc:
                    text = "".join([page.get_text() for page in doc])
                
                # Only save if it has text
                if len(text) > 50:
                    data.append({
                        "Filename": file, 
                        "Category": category, 
                        "Text": clean_text(text),
                        "Path": full_path
                    })
                    total_files += 1
                    
                    # Print progress every 100 files
                    if total_files % 100 == 0:
                        print(f"Processed {total_files} resumes...")
            except Exception as e:
                print(f"Error reading {file}: {e}")
                
    print(f"Done! Loaded {len(data)} total resumes.")
    return pd.DataFrame(data)

def load_jobs(base_path):
    print(f"--- Scanning Jobs in {base_path} ---")
    all_jobs = []
    if os.path.exists(base_path):
        csv_files = [f for f in os.listdir(base_path) if f.lower().endswith('.csv')]
        for file in csv_files:
            try:
                df = pd.read_csv(os.path.join(base_path, file)).fillna("")
                
                # Auto-detect columns
                text_col = next((c for c in df.columns if 'description' in c.lower()), None)
                title_col = next((c for c in df.columns if 'title' in c.lower()), None)
                
                if text_col:
                    temp = pd.DataFrame()
                    temp['Job_Title'] = df[title_col] if title_col else "Job " + df.index.astype(str)
                    temp['Job_Text'] = df[text_col].apply(clean_text)
                    all_jobs.append(temp)
                    print(f"Loaded jobs from {file}")
            except: pass
            
    return pd.concat(all_jobs, ignore_index=True) if all_jobs else pd.DataFrame()

In [9]:
def load_job_descriptions(base_path):
    """
    Scans the folder for CSV files and loads Job Descriptions.
    It attempts to find columns like 'Job Description', 'Description', or 'Job Title'.
    """
    if not os.path.exists(base_path):
        print(f"Warning: Job Desc folder '{base_path}' not found.")
        return pd.DataFrame()

    print(f"--- Scanning Job Descriptions in {base_path} ---")
    all_jobs = []

    # Find all CSV files in the folder
    csv_files = [f for f in os.listdir(base_path) if f.lower().endswith('.csv')]
    
    for file in csv_files:
        file_path = os.path.join(base_path, file)
        try:
            df = pd.read_csv(file_path)
            print(f"Loaded {file} with columns: {list(df.columns)}")
            
            # --- INTELLIGENT COLUMN DETECTION ---
            # We need to find which column contains the text we want to match.
            # We look for common names.
            text_col = None
            title_col = None
            
            # 1. Find the Text Column
            candidates_text = ['Job Description', 'Description', 'job_description', 'description', 'Job Text']
            for col in candidates_text:
                if col in df.columns:
                    text_col = col
                    break
            
            # 2. Find the Title Column (Optional, for display)
            candidates_title = ['Job Title', 'Title', 'job_title', 'position']
            for col in candidates_title:
                if col in df.columns:
                    title_col = col
                    break
            
            # If we found a text column, clean it and add to our list
            if text_col:
                # Fill NaNs
                df[text_col] = df[text_col].fillna("")
                if title_col:
                    df[title_col] = df[title_col].fillna("Unknown Role")
                
                # Standardize DataFrame
                temp_df = pd.DataFrame()
                temp_df['Job_Text'] = df[text_col].apply(clean_text)
                
                # Create a display title: "Software Engineer (ID: 1)"
                if title_col:
                    temp_df['Job_Title'] = df[title_col]
                else:
                    temp_df['Job_Title'] = "Job Role " + temp_df.index.astype(str)
                
                all_jobs.append(temp_df)
                
        except Exception as e:
            print(f"Error reading {file}: {e}")

    if all_jobs:
        return pd.concat(all_jobs, ignore_index=True)
    else:
        return pd.DataFrame()

In [4]:
def run_semantic_matching(resumes_df, jobs_df):
    
    # 1. Load Model
    print("\nLoading SBERT Model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # 2. Embed Resumes
    print(f"Encoding {len(resumes_df)} Resumes...")
    resume_embeddings = model.encode(resumes_df['Resume_Text'].tolist(), show_progress_bar=True)
    
    # 3. Embed Job Descriptions
    print(f"Encoding {len(jobs_df)} Job Descriptions...")
    job_embeddings = model.encode(jobs_df['Job_Text'].tolist(), show_progress_bar=True)
    
    # 4. Calculate Similarity Matrix
    # Result is a Matrix of shape (Num_Jobs, Num_Resumes)
    print("Calculating Cosine Similarity Matrix...")
    similarity_matrix = cosine_similarity(job_embeddings, resume_embeddings)
    
    return similarity_matrix

In [6]:
if __name__ == "__main__":
    
    # 1. LOAD DATA
    print("Loading Resumes...")
    df_resumes = load_resumes(PATH_RESUMES)
    
    print("Loading Job Descriptions...")
    df_jobs = load_job_descriptions(PATH_JOBS)
    
    # --- SAFETY CHECK: LIMIT DATA SIZE ---
    # 30,000 jobs is too many for a quick test. 
    # Let's slice it to the first 50 jobs to ensure the code works first.
    if len(df_jobs) > 50:
        print(f"\n[INFO] Dataset is large ({len(df_jobs)} jobs). Using first 50 for testing.")
        df_jobs = df_jobs.head(50)
    # -------------------------------------
    
    if df_resumes.empty or df_jobs.empty:
        print("Error: One of the datasets is empty. Please check paths.")
    else:
        # 2. RUN MATCHING
        # This will now compare (50 Jobs) x (All Resumes)
        sim_matrix = run_semantic_matching(df_resumes, df_jobs)
        
        # 3. DISPLAY TOP MATCHES
        print("\n" + "="*80)
        print(" MATCHING RESULTS ")
        print("="*80)
        
        # Show top matches for the first 5 jobs
        num_jobs_to_show = min(5, len(df_jobs))
        
        for job_idx in range(num_jobs_to_show):
            # Safe access to title (handles cases where title might be missing)
            job_title = df_jobs.iloc[job_idx]['Job_Title']
            print(f"\nJOB: {job_title}")
            print("-" * 40)
            
            # Get scores for this specific job
            job_scores = sim_matrix[job_idx]
            
            # Sort indices by score (Descending)
            # Get top 3 best resumes for this job
            top_indices = np.argsort(job_scores)[::-1][:3] 
            
            for rank, resume_idx in enumerate(top_indices):
                score = job_scores[resume_idx]
                resume_name = df_resumes.iloc[resume_idx]['Resume_ID']
                category = df_resumes.iloc[resume_idx]['Category']
                
                print(f"  Rank {rank+1}: {score*100:.1f}% Match | {category} | {resume_name}")

        # Optional: Save ALL results to a CSV
        print(f"\nSaving results for {len(df_jobs)} jobs to 'final_matching_results.csv'...")
        
        # Create a list to store all match rows
        results_data = []
        for i in range(len(df_jobs)):
            job_title = df_jobs.iloc[i]['Job_Title']
            job_scores = sim_matrix[i]
            best_resume_idx = np.argmax(job_scores)
            best_score = job_scores[best_resume_idx]
            best_resume_name = df_resumes.iloc[best_resume_idx]['Resume_ID']
            
            results_data.append({
                "Job Title": job_title,
                "Best Resume": best_resume_name,
                "Match Score": best_score
            })
        
        pd.DataFrame(results_data).to_csv("final_matching_results.csv", index=False)
        print("Done.")

Loading Resumes...
--- Scanning Resumes in archive(1)/data/data ---
Total Resumes Loaded: 2483
Loading Job Descriptions...
--- Scanning Job Descriptions in jobdesc/home/sdf ---
Loaded marketing_sample_for_trulia_com-real_estate__20190901_20191031__30k_data.csv with columns: ['Job Title', 'Job Description', 'Job Type', 'Categories', 'Location', 'City', 'State', 'Country', 'Zip Code', 'Address', 'Salary From', 'Salary To', 'Salary Period', 'Apply Url', 'Apply Email', 'Employees', 'Industry', 'Company Name', 'Employer Email', 'Employer Website', 'Employer Phone', 'Employer Logo', 'Companydescription', 'Employer Location', 'Employer City', 'Employer State', 'Employer Country', 'Employer Zip Code', 'Uniq Id', 'Crawl Timestamp']

[INFO] Dataset is large (30002 jobs). Using first 50 for testing.

Loading SBERT Model...
Encoding 2483 Resumes...


Batches: 100%|██████████| 78/78 [00:40<00:00,  1.91it/s]


Encoding 50 Job Descriptions...


Batches: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s]

Calculating Cosine Similarity Matrix...

 MATCHING RESULTS 

JOB: Shift Manager
----------------------------------------
  Rank 1: 61.6% Match | SALES | 92200491.pdf
  Rank 2: 61.5% Match | CHEF | 21334981.pdf
  Rank 3: 59.3% Match | CHEF | 13212436.pdf

JOB: Operations Support Manager
----------------------------------------
  Rank 1: 69.3% Match | CHEF | 13212436.pdf
  Rank 2: 67.1% Match | CHEF | 21334981.pdf
  Rank 3: 64.6% Match | CHEF | 11444536.pdf

JOB: Senior Product Manager - Data
----------------------------------------
  Rank 1: 62.3% Match | DIGITAL-MEDIA | 62700506.pdf
  Rank 2: 61.5% Match | DIGITAL-MEDIA | 27080812.pdf
  Rank 3: 59.5% Match | DIGITAL-MEDIA | 28679359.pdf

JOB: Part-Time Office Concierge
----------------------------------------
  Rank 1: 57.3% Match | BANKING | 16407619.pdf
  Rank 2: 54.9% Match | SALES | 30529547.pdf
  Rank 3: 54.9% Match | HR | 16852973.pdf

JOB: Print & Marketing Associate
----------------------------------------
  Rank 1: 60.2% Match




In [11]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer

# --- CONFIGURATION ---
PATH_RESUMES = "archive(1)/data/data"
PATH_JOBS = "jobdesc/home/sdf"
OUTPUT_FILE = "processed_data.pkl"

# Use the fast model
MODEL_NAME = 'all-MiniLM-L6-v2'

def clean_text(text):
    if not text: return ""
    text = str(text).replace("\n", " ").replace("\r", " ").replace("\t", " ")
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    return re.sub(' +', ' ', text).strip()

def load_resumes(base_path):
    print(f"--- Scanning Resumes in {base_path} ---")
    data = []
    total_files = 0
    
    # Walk through all folders (Accountant, Engineering, etc.)
    for root, dirs, files in os.walk(base_path):
        category = os.path.basename(root)
        if root == base_path: continue
        
        pdf_files = [f for f in files if f.lower().endswith(".pdf")]
        
        for file in pdf_files:
            try:
                # Read PDF
                full_path = os.path.join(root, file)
                with fitz.open(full_path) as doc:
                    text = "".join([page.get_text() for page in doc])
                
                # Only save if it has text
                if len(text) > 50:
                    data.append({
                        "Filename": file, 
                        "Category": category, 
                        "Text": clean_text(text),
                        "Path": full_path
                    })
                    total_files += 1
                    
                    if total_files % 100 == 0:
                        print(f"Processed {total_files} resumes...")
            except Exception as e:
                print(f"Error reading {file}: {e}")
                
    print(f"Done! Loaded {len(data)} total resumes.")
    return pd.DataFrame(data)

# --- YOUR CUSTOM FUNCTION HERE ---
def load_job_descriptions(base_path):
    """
    Scans the folder for CSV files and loads Job Descriptions.
    It attempts to find columns like 'Job Description', 'Description', or 'Job Title'.
    """
    if not os.path.exists(base_path):
        print(f"Warning: Job Desc folder '{base_path}' not found.")
        return pd.DataFrame()

    print(f"--- Scanning Job Descriptions in {base_path} ---")
    all_jobs = []

    # Find all CSV files in the folder
    csv_files = [f for f in os.listdir(base_path) if f.lower().endswith('.csv')]
    
    for file in csv_files:
        file_path = os.path.join(base_path, file)
        try:
            df = pd.read_csv(file_path)
            print(f"Loaded {file} with columns: {list(df.columns)}")
            
            # --- INTELLIGENT COLUMN DETECTION ---
            text_col = None
            title_col = None
            
            # 1. Find the Text Column
            candidates_text = ['Job Description', 'Description', 'job_description', 'description', 'Job Text']
            for col in candidates_text:
                if col in df.columns:
                    text_col = col
                    break
            
            # 2. Find the Title Column (Optional, for display)
            candidates_title = ['Job Title', 'Title', 'job_title', 'position']
            for col in candidates_title:
                if col in df.columns:
                    title_col = col
                    break
            
            # If we found a text column, clean it and add to our list
            if text_col:
                # Fill NaNs
                df[text_col] = df[text_col].fillna("")
                if title_col:
                    df[title_col] = df[title_col].fillna("Unknown Role")
                
                # Standardize DataFrame
                temp_df = pd.DataFrame()
                temp_df['Job_Text'] = df[text_col].apply(clean_text)
                
                # Create a display title: "Software Engineer (ID: 1)"
                if title_col:
                    temp_df['Job_Title'] = df[title_col]
                else:
                    temp_df['Job_Title'] = "Job Role " + temp_df.index.astype(str)
                
                all_jobs.append(temp_df)
                
        except Exception as e:
            print(f"Error reading {file}: {e}")

    if all_jobs:
        full_df = pd.concat(all_jobs, ignore_index=True)
        return full_df.head(100) # <--- Keep this for a fast, smooth demo
    else:
        return pd.DataFrame()

if __name__ == "__main__":
    print("STARTING DATA PROCESSING ENGINE")
    print("="*40)
    
    # 1. Load All Text Data
    df_resumes = load_resumes(PATH_RESUMES)
    
    # CALLING YOUR FUNCTION HERE
    df_jobs = load_job_descriptions(PATH_JOBS)
    
    if df_resumes.empty:
        print("CRITICAL ERROR: No resumes found! Check your path.")
        exit()

    # 2. Pre-Calculate Math (Embeddings)
    print("\nLoading AI Model (this happens only once)...")
    model = SentenceTransformer(MODEL_NAME)
    
    print(f"Calculating vectors for {len(df_resumes)} resumes...")
    resume_vectors = model.encode(df_resumes['Text'].tolist(), show_progress_bar=True)
    
    print(f"Calculating vectors for {len(df_jobs)} jobs...")
    # Check if jobs were found before encoding
    if not df_jobs.empty:
        job_vectors = model.encode(df_jobs['Job_Text'].tolist(), show_progress_bar=True)
    else:
        print("Warning: No jobs found to encode. App will only work for custom pasted jobs.")
        job_vectors = []
    
    # 3. Save to Disk
    print(f"\nSaving data to '{OUTPUT_FILE}'...")
    with open(OUTPUT_FILE, 'wb') as f:
        pickle.dump({
            "resumes": df_resumes,
            "resume_vectors": resume_vectors,
            "jobs": df_jobs,
            "job_vectors": job_vectors
        }, f)
        
    print("\nSUCCESS! You never have to run this script again.")
    print(f"Run 'streamlit run app.py' now to see the instant app.")

STARTING DATA PROCESSING ENGINE
--- Scanning Resumes in archive(1)/data/data ---
Processed 100 resumes...
Processed 200 resumes...
Processed 300 resumes...
Processed 400 resumes...
Processed 500 resumes...
Processed 600 resumes...
Processed 700 resumes...
Processed 800 resumes...
Processed 900 resumes...
Processed 1000 resumes...
Processed 1100 resumes...
Processed 1200 resumes...
Processed 1300 resumes...
Processed 1400 resumes...
Processed 1500 resumes...
Processed 1600 resumes...
Processed 1700 resumes...
Processed 1800 resumes...
Processed 1900 resumes...
Processed 2000 resumes...
Processed 2100 resumes...
Processed 2200 resumes...
Processed 2300 resumes...
Processed 2400 resumes...
Done! Loaded 2483 total resumes.
--- Scanning Job Descriptions in jobdesc/home/sdf ---
Loaded marketing_sample_for_trulia_com-real_estate__20190901_20191031__30k_data.csv with columns: ['Job Title', 'Job Description', 'Job Type', 'Categories', 'Location', 'City', 'State', 'Country', 'Zip Code', 'Address

Batches: 100%|██████████| 78/78 [00:49<00:00,  1.56it/s]


Calculating vectors for 100 jobs...


Batches: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]


Saving data to 'processed_data.pkl'...

SUCCESS! You never have to run this script again.
Run 'streamlit run app.py' now to see the instant app.





In [19]:
def calculate_accuracy_detailed(resume_vectors, job_vectors, df_resumes, df_jobs):
    print("\n--- DETAILED ACCURACY BREAKDOWN ---")
    
    similarity_matrix = cosine_similarity(resume_vectors, job_vectors)
    
    # Use the same map from before
    category_map = {
        "information-technology": ["it", "tech", "software", "developer", "data", "analyst"],
        "business-development": ["business", "development", "manager", "sales", "growth"],
        "accountant": ["account", "tax", "audit", "finance", "cpa"],
        "advocate": ["legal", "law", "attorney", "counsel"],
        "chef": ["chef", "cook", "culinary", "kitchen"],
        "engineering": ["engineer", "mechanical", "electrical"],
        "finance": ["finance", "banking", "investment"],
        "aviation": ["pilot", "aviation", "flight", "aircraft"],
        "fitness": ["fitness", "gym", "trainer", "coach"],
        "sales": ["sales", "account executive", "rep"],
        "healthcare": ["health", "medical", "nurse", "doctor"],
        "consultant": ["consultant", "advisor"],
        "banking": ["bank", "lending", "credit"],
        "construction": ["construction", "site", "project manager"],
        "public-relations": ["pr", "public relations", "media"],
        "hr": ["human resources", "recruiter", "talent"],
        "designer": ["design", "creative", "art", "graphic", "ux", "ui"],
        "arts": ["art", "creative", "design", "gallery"]
    }

    category_stats = {} # Store correct/total per category

    for i in range(len(df_resumes)):
        cat = str(df_resumes.iloc[i]['Category']).strip()
        cat_key = cat.lower()
        
        if cat not in category_stats:
            category_stats[cat] = {"total": 0, "correct": 0}
            
        category_stats[cat]["total"] += 1
        
        # Check Top 3 Matches
        top_3_indices = np.argsort(similarity_matrix[i])[-3:][::-1]
        
        found_match = False
        for idx in top_3_indices:
            pred_title = str(df_jobs.iloc[idx]['Job_Title']).lower()
            
            if cat_key in pred_title:
                found_match = True
            elif cat_key in category_map:
                for keyword in category_map[cat_key]:
                    if keyword in pred_title:
                        found_match = True
            if found_match: break
        
        if found_match:
            category_stats[cat]["correct"] += 1

    # Print Report
    print(f"{'CATEGORY':<25} | {'ACCURACY':<10} | {'COUNT'}")
    print("-" * 50)
    
    total_correct = 0
    total_count = 0
    
    for cat, stats in category_stats.items():
        acc = (stats["correct"] / stats["total"]) * 100
        print(f"{cat:<25} | {acc:6.2f}%    | {stats['total']}/{stats['total']}")
        
        total_correct += stats["correct"]
        total_count += stats["total"]
        
    print("-" * 50)
    print(f"OVERALL ACCURACY: {(total_correct/total_count)*100:.2f}%")

# Run it
calculate_accuracy_detailed(resume_vectors, job_vectors, df_resumes, df_jobs)


--- DETAILED ACCURACY BREAKDOWN ---
CATEGORY                  | ACCURACY   | COUNT
--------------------------------------------------
ACCOUNTANT                |  63.56%    | 118/118
ADVOCATE                  |   0.00%    | 118/118
AGRICULTURE               |   0.00%    | 63/63
APPAREL                   |   0.00%    | 97/97
ARTS                      |   5.83%    | 103/103
AUTOMOBILE                |   0.00%    | 36/36
AVIATION                  |  25.64%    | 117/117
BANKING                   |  57.39%    | 115/115
BPO                       |   0.00%    | 22/22
BUSINESS-DEVELOPMENT      |  99.16%    | 119/119
CHEF                      |   0.00%    | 118/118
CONSTRUCTION              |   1.79%    | 112/112
CONSULTANT                |  26.09%    | 115/115
DESIGNER                  |   0.00%    | 107/107
DIGITAL-MEDIA             |   0.00%    | 96/96
ENGINEERING               |  76.27%    | 118/118
FINANCE                   |   5.93%    | 118/118
FITNESS                   |   0.00%    | 1

In [18]:
if __name__ == "__main__":
    print("STARTING DATA PROCESSING ENGINE")
    print("="*40)
    
    # 1. Load All Text Data
    df_resumes = load_resumes(PATH_RESUMES)
    df_jobs = load_job_descriptions(PATH_JOBS)
    
    if df_resumes.empty:
        print("CRITICAL ERROR: No resumes found! Check your path.")
        exit()

    # 2. Pre-Calculate Math (Embeddings)
    print("\nLoading AI Model (this happens only once)...")
    model = SentenceTransformer(MODEL_NAME)
    
    print(f"Calculating vectors for {len(df_resumes)} resumes...")
    resume_vectors = model.encode(df_resumes['Text'].tolist(), show_progress_bar=True)
    
    print(f"Calculating vectors for {len(df_jobs)} jobs...")
    if not df_jobs.empty:
        job_vectors = model.encode(df_jobs['Job_Text'].tolist(), show_progress_bar=True)
        
        # --- HERE IS HOW YOU CALL IT ---
        calculate_accuracy(resume_vectors, job_vectors, df_resumes, df_jobs)
        # -------------------------------
        
    else:
        print("Warning: No jobs found. Skipping accuracy check.")
        job_vectors = []
    
    # 3. Save to Disk
    print(f"\nSaving data to '{OUTPUT_FILE}'...")
    with open(OUTPUT_FILE, 'wb') as f:
        pickle.dump({
            "resumes": df_resumes,
            "resume_vectors": resume_vectors,
            "jobs": df_jobs,
            "job_vectors": job_vectors
        }, f)
        
    print("\nSUCCESS! Data processing complete.")

STARTING DATA PROCESSING ENGINE
--- Scanning Resumes in archive(1)/data/data ---
Processed 100 resumes...
Processed 200 resumes...
Processed 300 resumes...
Processed 400 resumes...
Processed 500 resumes...
Processed 600 resumes...
Processed 700 resumes...
Processed 800 resumes...
Processed 900 resumes...
Processed 1000 resumes...
Processed 1100 resumes...
Processed 1200 resumes...
Processed 1300 resumes...
Processed 1400 resumes...
Processed 1500 resumes...
Processed 1600 resumes...
Processed 1700 resumes...
Processed 1800 resumes...
Processed 1900 resumes...
Processed 2000 resumes...
Processed 2100 resumes...
Processed 2200 resumes...
Processed 2300 resumes...
Processed 2400 resumes...
Done! Loaded 2483 total resumes.
--- Scanning Job Descriptions in jobdesc/home/sdf ---
Loaded marketing_sample_for_trulia_com-real_estate__20190901_20191031__30k_data.csv with columns: ['Job Title', 'Job Description', 'Job Type', 'Categories', 'Location', 'City', 'State', 'Country', 'Zip Code', 'Address

Batches: 100%|██████████| 78/78 [00:40<00:00,  1.91it/s]


Calculating vectors for 100 jobs...


Batches: 100%|██████████| 4/4 [00:01<00:00,  2.43it/s]



--- CALCULATING SMART ACCURACY ---
Testing 2483 resumes against 100 jobs...
Smart Accuracy: 35.72%

Saving data to 'processed_data.pkl'...

SUCCESS! Data processing complete.


In [16]:
import pandas as pd
import pickle

# Load data
with open("processed_data.pkl", "rb") as f:
    data = pickle.load(f)

df_resumes = data["resumes"]
df_jobs = data["jobs"]

print(f"\n--- DATA DISTRIBUTION CHECK ---")
print(f"Total Resumes: {len(df_resumes)}")
print(f"Total Jobs: {len(df_jobs)}")

# 1. Count Resumes per Category
print("\n--- RESUME COUNTS (By Folder) ---")
print(df_resumes['Category'].value_counts())

# 2. Count Jobs per Title (First 10)
print("\n--- JOB COUNTS (By Title) ---")
print(df_jobs['Job_Title'].value_counts().head(10))


--- DATA DISTRIBUTION CHECK ---
Total Resumes: 2483
Total Jobs: 100

--- RESUME COUNTS (By Folder) ---
Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      119
ACCOUNTANT                118
ADVOCATE                  118
CHEF                      118
ENGINEERING               118
FINANCE                   118
AVIATION                  117
FITNESS                   117
SALES                     116
HEALTHCARE                115
CONSULTANT                115
BANKING                   115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

--- JOB COUNTS (By Title) ---
Job_Title
Sales Associate/Beauty Advisor                3
Lids Assistant Manager Full-Time              2
Shif