In [52]:
import pandas as pd
import os
import re

In [53]:
# Build the path with os.path.join: the original literal "job-csv\postings.csv"
# relied on the invalid escape sequence "\p" and only resolved on Windows.
postings_path = os.path.join("job-csv", "postings.csv")

# Load the Job Postings CSV file into a Pandas DataFrame
postings_df = pd.read_csv(postings_path)

# Display the first few rows of the dataset to understand its structure
postings_df.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


In [54]:
# Show the (rows, columns) dimensions of the full postings DataFrame
postings_df.shape

(123849, 31)

In [55]:
# Draw a fixed-size random sample of postings. The seed is pinned so that the
# sample (and every embedding / match derived from it) is reproducible after a
# kernel restart.
postings_sample_df = postings_df.sample(n=10000, random_state=42)
postings_sample_df.shape

(10000, 31)

In [56]:
def preprocess_text(text):
    """Normalize a single text value for feature building.

    Null values (``None``/``NaN``) become an empty string. Any other value is
    converted to ``str``, lowercased, stripped of every character that is not
    a word character or whitespace, and trimmed of surrounding whitespace.

    Args:
        text: A scalar cell value, normally a string.

    Returns:
        The cleaned string ("" for null input).
    """
    if pd.isnull(text):
        return ""
    # Coerce first: a non-null, non-string cell (e.g. a number) has no .lower()
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.strip()

def combine_features(row):
    """Combine the title, description and skills text of a posting into one string.

    Each non-null field becomes a ``"<Label>: <cleaned text>"`` section, where
    the label is the capitalized column name and the text is the field passed
    through ``preprocess_text``. Sections are separated by a single newline.

    Args:
        row: A mapping-like row supporting ``.get`` (e.g. a pandas Series).

    Returns:
        The joined sections, or "" when every field is missing or null.
    """
    sections = []
    for col in ['title', 'description', 'skills_desc']:
        value = row.get(col)  # a missing column is treated like a null cell
        if pd.isnull(value):
            continue
        sections.append(f"{col.capitalize()}: {preprocess_text(value)}")
    # Join with a bare newline so no section starts with a stray space and the
    # result carries no trailing newline.
    return '\n'.join(sections)

# Build the combined text field for every sampled posting (row-wise apply)
postings_sample_df['combined_features'] = postings_sample_df.apply(combine_features, axis=1)

# Preview the job id next to the combined text
postings_sample_df[['job_id', 'combined_features']].head()


Unnamed: 0,job_id,combined_features
123314,3906258204,Title: security officer\n Description: wage 15...
101133,3905241545,Title: interior designer\n Description: interi...
50604,3901374652,Title: ios developer\n Description: company de...
121311,3906242399,Title: visual designer\n Description: our clie...
60993,3901984921,Title: pediatrician\n Description: a recognize...


In [57]:
# Print the full combined text of the first sampled posting to inspect the preprocessing result
print(postings_sample_df.iloc[0]['combined_features'])


Title: security officer
 Description: wage 15001550 hour

we help make your world a safer place

are you interested in being part of our security team

apply quickly and efficiently online weekly pay competitive benefits employee referral bonus

security officerguard

security positions are full and part timemust have excellent customer service skillssecurity positions require you pass our drug screen and background checkmust be able to meet and continue to meet any applicable state county and municipal licensing requirements for security officersguard

job requirements of the security officersecurity guard include but are not limited to

security officerssecurity guards observes and reports activities and incidents at an assigned client site providing for the security and safety of client property and personnelfrequent sitting standing and walking which may be required for long periods of time and may involve climbing stairs and walking up inclines and on uneven terrainadditional phys

In [58]:
# Relative path to the resume dataset CSV
resume_path = 'job-resume/Resume/Resume.csv'

# Load the resume CSV file into a Pandas DataFrame
resume_df = pd.read_csv(resume_path)

# Display the first few rows of the dataset to understand its structure
resume_df.head()


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [59]:
def preprocess_resume_text(row):
    """Return the cleaned resume text for one row.

    Reads ``row['Resume_str']`` (defaulting to "" when the key is absent).
    A null value yields "". Otherwise every character other than word
    characters, whitespace and ``, + . / -`` is dropped, runs of whitespace
    collapse to one space, and the result is trimmed and lowercased.
    """
    raw = row.get('Resume_str', '')
    if pd.isnull(raw):
        return ""

    # Keep only the punctuation that carries meaning in resumes (e.g. "15+", "c/c++")
    kept = re.sub(r'[^\w\s,+./-]', '', raw)
    single_spaced = re.sub(r'\s+', ' ', kept)
    return single_spaced.strip().lower()

# Clean every resume row-wise and store the result in a new column
resume_df['preprocessed_resume'] = resume_df.apply(preprocess_resume_text, axis=1)

# Preview the resume ID next to its cleaned text
resume_df[['ID', 'preprocessed_resume']].head()

Unnamed: 0,ID,preprocessed_resume
0,16852973,hr administrator/marketing associate hr admini...
1,22323967,"hr specialist, us hr operations summary versat..."
2,33176873,hr director summary over 20 years experience i...
3,27018550,"hr specialist summary dedicated, driven, and d..."
4,17812897,hr manager skill highlights hr skills hr depar...


In [60]:
# Print the full cleaned text of the first resume to inspect the preprocessing result
print(resume_df.iloc[0]['preprocessed_resume'])


hr administrator/marketing associate hr administrator summary dedicated customer service manager with 15+ years of experience in hospitality and customer service management. respected builder and leader of customer-focused teams strives to instill a shared, enthusiastic commitment to customer service. highlights focused on customer satisfaction team management marketing savvy conflict resolution techniques training and development skilled multi-tasker client relations specialist accomplishments missouri dot supervisor training certification certified by ihg in customer loyalty and marketing by segment hilton worldwide general manager training certification accomplished trainer for cross server hospitality systems such as hilton onq , micros opera pms , fidelio opera reservation system ors , holidex completed courses and seminars in customer service, sales strategies, inventory control, loss prevention, safety, time management, leadership and performance assessment. experience hr admini

In [None]:
#%pip install -U voyageai
#%pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [62]:
import voyageai 
from dotenv import load_dotenv

# Load environment variables via python-dotenv (presumably from a local .env file)
load_dotenv()

# Read the API key from the environment; os.getenv returns None when it is unset
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")

# Initialize the API client and define embedding model
vo = voyageai.Client(api_key=VOYAGE_API_KEY)
embedding_model = "voyage-3-large"

def generate_embeddings(texts, batch_size=128, input_type="document", client=None, model=None):
    """Embed one or more texts, sending them to the embedding client in batches.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as ONE text; otherwise the batching loop below would slice
            it into ``batch_size``-character fragments and embed each fragment
            separately.
        batch_size: Maximum number of texts sent per ``embed`` call (>= 1).
        input_type: Forwarded to ``embed`` as ``input_type``.
        client: Object exposing ``embed(batch, model=..., input_type=...)``
            whose result has an ``embeddings`` attribute. Defaults to the
            notebook-level ``vo`` client.
        model: Embedding model name. Defaults to the notebook-level
            ``embedding_model``.

    Returns:
        A list with one embedding per input text, in input order, or ``None``
        if any batch fails (the error is printed and no partial result is
        returned).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Normalize the input: one string -> one-element list; any other iterable
    # (e.g. a pandas Series) -> plain list so slicing yields lists of strings.
    texts = [texts] if isinstance(texts, str) else list(texts)

    if client is None:
        client = vo
    if model is None:
        model = embedding_model

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        try:
            result = client.embed(batch, model=model, input_type=input_type)
            all_embeddings.extend(result.embeddings)
        except Exception as e:
            # Existing contract: report the failure and signal it with None
            print(f"Error generating embeddings: {e}")
            return None
    return all_embeddings

# Generate embeddings with batching
posting_embeddings = generate_embeddings(postings_sample_df['combined_features'].tolist())

# generate_embeddings signals an API failure by returning None; assigning that
# would silently create a column of None values, so stop here instead.
if posting_embeddings is None:
    raise RuntimeError("Embedding generation for job postings failed; see the error printed above.")

postings_sample_df['embeddings'] = posting_embeddings

In [63]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_jobs(resume_embedding, top_n=5):
    """Return positions of the postings most similar to a resume embedding.

    Computes the cosine similarity between ``resume_embedding`` and every
    vector in ``postings_sample_df["embeddings"]`` (notebook-level DataFrame).

    Args:
        resume_embedding: The embedding vector of one resume.
        top_n: How many matches to return.

    Returns:
        An array of positional indices into ``postings_sample_df`` (for use
        with ``.iloc``), ordered from most to least similar.
    """
    similarities = cosine_similarity([resume_embedding], postings_sample_df["embeddings"].tolist())[0]
    # Reverse first, then take the head: the previous ``[-top_n:]`` form
    # returned EVERY index for top_n == 0 because ``[-0:]`` is the whole array.
    top_indices = similarities.argsort()[::-1][:top_n]
    return top_indices

resume_data = resume_df.iloc[0]["preprocessed_resume"]

# Pass the resume as a one-element list: generate_embeddings batches by slicing
# its argument, so a bare string would be cut into batch_size-character
# fragments and [0] would be the embedding of only the first fragment.
resume_embeddings = generate_embeddings([resume_data])
if resume_embeddings is None:
    raise RuntimeError("Embedding generation for the resume failed; see the error printed above.")
resume_embedding = resume_embeddings[0]

similar_job_indices = find_similar_jobs(resume_embedding)

print("Top job matches for resume:")
print(resume_df.iloc[0]['Resume_str'])
print("-" * 50)

# similar_job_indices are positional, hence .iloc on the sampled postings
for i, job_index in enumerate(similar_job_indices):
    job_posting = postings_sample_df.iloc[job_index]
    print(f"Match {i+1}: {job_posting['title']}")
    print(f"Similarity Score: {cosine_similarity([resume_embedding], [job_posting['embeddings']])[0][0]:.4f}")
    print(f"Description: {job_posting['description']}")
    print("-" * 50)

Top job matches for resume:
         HR ADMINISTRATOR/MARKETING ASSOCIATE

HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, i