### Import

In [1]:
import os
import pandas as pd
from services.ResumeInfoExtraction import ResumeInfoExtraction
from services.JobInfoExtraction import JobInfoExtraction
from source.schemas.resumeextracted import ResumeExtractedModel # Let's reintroduce later on
from source.schemas.jobextracted import JobExtractedModel # Let's reintroduce later on
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
import markdown
import warnings 
import logging
import torch
import torch.nn.functional as F

logging.getLogger('pypdf').setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Get the absolute path of the root directory
ROOT_DIR = os.path.abspath(r'C:\Users\irene\OneDrive\Desktop\Thesis\MAIN_CODE')

# Paths to your pattern files
skills_patterns_path = os.path.join(ROOT_DIR, 'workproject_matching_algo','Resources', 'data', 'skills.jsonl')

  from tqdm.autonotebook import tqdm, trange


In [2]:
import main

In [75]:
df_resumes = main.get_resumes('resumes')

In [76]:
df_resumes = main.resume_extraction(df_resumes)
df_resumes

Unnamed: 0,name,raw,Skills
0,CV Ilaria Gioia eng,Ilaria Gioia \n \n \n \nCONTACTS \n+39 3661699...,"[marketing, communications, advertising, googl..."
1,CV Rick 2,Riccardo Lombardo \n(+39) 338 214 2704 \nricca...,"[business, finance, business administration, c..."
2,CV.Ludovica Baccilieri,\nLUDOVICA BACCILIERI \n \n \n \n \n \n \n \n...,"[business, marketing, code, customer relations..."
3,Irene Abbatelli CV,"\n \n \nIRENE ABBATELLI \nRome, Italy \n(+39)...","[business, analytics, data analysis, data visu..."


In [77]:
df_resumes = df_resumes.iloc[[0]]
df_resumes

Unnamed: 0,name,raw,Skills
0,CV Ilaria Gioia eng,Ilaria Gioia \n \n \n \nCONTACTS \n+39 3661699...,"[marketing, communications, advertising, googl..."


*conda install -c conda-forge openpyxl*

### Skill_Role df

In [6]:
file_path = os.path.join('skills_roles_data', 'Technology Skills.xlsx')
df_skill_role = pd.read_excel(file_path)
df_skill_role

Unnamed: 0,O*NET-SOC Code,Title,Example,Commodity Code,Commodity Title,Hot Technology,In Demand
0,11-1011.00,Chief Executives,Adobe Acrobat,43232202,Document management software,Y,N
1,11-1011.00,Chief Executives,AdSense Tracker,43232306,Data base user interface and query software,N,N
2,11-1011.00,Chief Executives,Atlassian JIRA,43232201,Content workflow software,Y,N
3,11-1011.00,Chief Executives,Blackbaud The Raiser's Edge,43232303,Customer relationship management CRM software,N,N
4,11-1011.00,Chief Executives,ComputerEase construction accounting software,43231601,Accounting software,N,N
...,...,...,...,...,...,...,...
32622,53-7121.00,"Tank Car, Truck, and Ship Loaders",Linux,43233004,Operating system software,Y,N
32623,53-7121.00,"Tank Car, Truck, and Ship Loaders",Microsoft Excel,43232110,Spreadsheet software,Y,N
32624,53-7121.00,"Tank Car, Truck, and Ship Loaders",Microsoft Office software,43231513,Office suite software,Y,N
32625,53-7121.00,"Tank Car, Truck, and Ship Loaders",SAP software,43231602,Enterprise resource planning ERP software,Y,N


In [7]:
df_skill_role = df_skill_role.drop(columns=['O*NET-SOC Code', 'Commodity Code', 'Commodity Title'])

In [8]:
df_skill_role_filtered = df_skill_role[(df_skill_role['Hot Technology'] == 'Y') & (df_skill_role['In Demand'] == 'Y')]
df_skill_role_filtered

Unnamed: 0,Title,Example,Hot Technology,In Demand
27,Chief Executives,Microsoft Excel,Y,Y
29,Chief Executives,Microsoft Office software,Y,Y
31,Chief Executives,Microsoft PowerPoint,Y,Y
57,Chief Sustainability Officers,Microsoft Office software,Y,Y
140,General and Operations Managers,Microsoft Excel,Y,Y
...,...,...,...,...
32460,Aviation Inspectors,Microsoft Excel,Y,Y
32464,Aviation Inspectors,Microsoft Word,Y,Y
32474,"Transportation Vehicle, Equipment and Systems ...",Microsoft Excel,Y,Y
32475,"Transportation Vehicle, Equipment and Systems ...",Microsoft Office software,Y,Y


In [9]:
df_skill_role_grouped = df_skill_role_filtered.groupby('Title')['Example'].apply(list).reset_index()
df_skill_role_grouped.rename(columns={'Example': 'Skills'}, inplace=True)
df_skill_role_grouped

Unnamed: 0,Title,Skills
0,Accountants and Auditors,"[Intuit QuickBooks, Microsoft Excel, Microsoft..."
1,Actuaries,"[Microsoft Excel, Microsoft Office software, M..."
2,Administrative Services Managers,"[Microsoft Excel, Microsoft Office software, M..."
3,"Adult Basic Education, Adult Secondary Educati...","[Microsoft Excel, Microsoft Office software]"
4,Advertising Sales Agents,"[Adobe Creative Cloud software, Adobe Illustra..."
...,...,...
448,Wind Energy Engineers,"[C++, Microsoft Excel, Python, The MathWorks M..."
449,Wind Turbine Service Technicians,"[Microsoft Office software, SAP software]"
450,Word Processors and Typists,"[Microsoft Excel, Microsoft Office software, M..."
451,Writers and Authors,"[Adobe Photoshop, Microsoft Excel, Microsoft O..."


### Job Description df

In [10]:
file_path_1 = os.path.join('skills_roles_data', 'Occupation Data.xlsx')
df_jobdescription = pd.read_excel(file_path_1)
df_jobdescription

Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
...,...,...,...
1011,55-3014.00,Artillery and Missile Crew Members,"Target, fire, and maintain weapons used to des..."
1012,55-3015.00,Command and Control Center Specialists,"Operate and monitor communications, detection,..."
1013,55-3016.00,Infantry,Operate weapons and equipment in ground combat...
1014,55-3018.00,Special Forces,"Implement unconventional operations by air, la..."


In [11]:
df_jobdescription.rename(columns={'Description': 'raw'}, inplace=True)
df_jobdescription = df_jobdescription.drop(columns=['O*NET-SOC Code'])

In [12]:
# Get unique titles from df_skill_role_grouped
unique_titles = df_skill_role_grouped['Title'].unique()

# Filter df_jobs to only include rows with titles in unique_titles
filtered_df_jobs = df_jobdescription[df_jobdescription['Title'].isin(unique_titles)]

filtered_df_jobs

Unnamed: 0,Title,raw
0,Chief Executives,Determine and formulate policies and provide o...
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,Legislators,"Develop, introduce, or enact laws and statutes..."
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
...,...,...
967,"Captains, Mates, and Pilots of Water Vessels",Command or supervise operations of ships and w...
974,Traffic Technicians,Conduct field studies to determine traffic vol...
975,Transportation Inspectors,Inspect equipment or goods in connection with ...
976,Aviation Inspectors,"Inspect aircraft, maintenance procedures, air ..."


In [13]:
df_jobs = main.job_info_extraction(filtered_df_jobs)
df_jobs

Unnamed: 0,Title,raw,Skills
0,Chief Executives,Determine and formulate policies and provide o...,[operations research]
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[operations research]
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",[operations research]
3,Legislators,"Develop, introduce, or enact laws and statutes...",[operations research]
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...","[operations research, advertising]"
...,...,...,...
967,"Captains, Mates, and Pilots of Water Vessels",Command or supervise operations of ships and w...,[operations research]
974,Traffic Technicians,Conduct field studies to determine traffic vol...,[traffic engineering]
975,Transportation Inspectors,Inspect equipment or goods in connection with ...,[operations research]
976,Aviation Inspectors,"Inspect aircraft, maintenance procedures, air ...",[communications]


### Similarity score calculation separated 

In [45]:
def calc_similarity_one_cv(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions 
    based on MPNET embeddings of combined skills.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()

    # Add similarity scores to the job descriptions DataFrame
    job_df['similarity_score'] = similarity_scores

    # Rank the jobs by similarity score
    job_df['rank'] = job_df['similarity_score'].rank(ascending=False)

    # Return a DataFrame with the job titles, similarity scores, and ranks
    ranked_jobs = job_df[['Title', 'similarity_score', 'rank']].sort_values(by='rank', ascending=True)
    return ranked_jobs


In [46]:
analysis_data_skills = calc_similarity_one_cv(df_resumes, df_skill_role_grouped, parallel=True)
analysis_data_skills.head(20)

Unnamed: 0,Title,similarity_score,rank
111,Data Scientists,0.530693,1.0
442,Web Administrators,0.509691,2.0
394,Software Quality Assurance Analysts and Testers,0.501967,3.0
393,Software Developers,0.491808,4.0
92,Computer and Information Research Scientists,0.488537,5.0
380,Search Marketing Strategists,0.483493,6.0
87,Computer Programmers,0.481745,7.0
90,Computer Systems Engineers/Architects,0.479638,8.0
42,Blockchain Engineers,0.475844,9.0
114,Database Architects,0.475259,10.0


In [47]:
analysis_data_descr = calc_similarity_one_cv(df_resumes, df_jobs, parallel=True)
analysis_data_descr.head(20)

Unnamed: 0,Title,similarity_score,rank
85,Market Research Analysts and Marketing Special...,0.709185,1.0
122,Software Developers,0.627495,2.0
86,Search Marketing Strategists,0.623605,3.0
90,Online Merchants,0.605184,4.0
125,Web and Digital Interface Designers,0.60393,5.0
371,Commercial and Industrial Designers,0.591882,6.0
154,Bioengineers and Biomedical Engineers,0.59181,7.0
109,Computer Systems Analysts,0.571316,8.0
4,Advertising and Promotions Managers,0.564877,10.0
664,Order Clerks,0.564877,10.0


**Example check**

In [48]:
df_resumes['Skills'].value_counts()

Skills
[business, marketing, code, customer relationship management, business administration, international trade, communications, design, wordpress, boosting machine learning, advertising, go]    1
Name: count, dtype: int64

In [49]:
df_jobs[df_jobs['Title'] == 'Search Marketing Strategists']['Skills'].values[0]
df_jobs[df_jobs['Title'] == 'Online Merchants']['raw'].values[0]

'Conduct retail activities of businesses operating exclusively online. May perform duties such as preparing business strategies, buying merchandise, managing inventory, implementing marketing activities, fulfilling and shipping online orders, and balancing financial records.'

In [50]:
df_skill_role_grouped[df_skill_role_grouped['Title'] == 'Market Research Analysts and Marketing Specialists']['Skills'].values[0]

['Google Analytics',
 'Microsoft Excel',
 'Microsoft Office software',
 'Microsoft Outlook',
 'Microsoft PowerPoint',
 'Salesforce software']

### Similarity score calculation merged 

In [51]:
# Merge the dataframes on the Title column
merged_skills = pd.merge(
    df_jobs[['Title', 'Skills']],
    df_skill_role_grouped[['Title', 'Skills']],
    on='Title',
    how='outer',
    suffixes=('_jobs', '_resumes')
)

# Combine the lists from both 'Skills' columns into one
merged_skills['Skills'] = merged_skills.apply(
    lambda row: list(set(row['Skills_jobs'] or []) | set(row['Skills_resumes'] or [])),
    axis=1
)

# Select only the 'Title' and merged 'Skills' columns
result_df_1 = merged_skills[['Title', 'Skills']]


In [52]:
# Original multiline text
roles_text = """
Adult Basic Education, Adult Secondary Education, and English as a Second Language Instructors
Aerospace Engineers
Agents and Business Managersof Artists, Pereformers, and Athletes
Agricultural Inspectors
Agricultural Technicians
Air Traffic Controllers
Aircraft Mechanics and Service Technicians
Airfield Operations Specialists
Anesthesiologists
Animal Control Workers
Anthropologists and Archeologists
Appraisers and Assessors of Real Estate
Arbitrators, Mediators, and Conciliators
Architects, Except Landscape and Naval
Architectural and Civil Drafters
Architectural and Engineering Managers
Architecture Teachers, Postsecondary
Archivists
Art Directors
Art Therapists
Art, Drama, and Music Teachers, Postsecondary
Astronomers
Athletes and Sports Competitors
Atmospheric and Space Scientists
Audio and Video Technicians
Audiologists
Automotive Engineers
Aviation Inspectors
Avionics Technicians
Biochemists and Biophysicists
Bioengineers and Biomedical Engineers
Bioinformatics Scientists
Biological Technicians
Biologists
Biostatisticians
Broadcast Announcers and Radio Disc Jockeys
Broadcast Technicians
Brownfield Redevelopment Specialists and Site Managers
Buyers and Purchasing Agents, Farm Products
Calibration Technologists and Technicians
Camera Operators, Television, Video, and Film
Captains, Mates, and Pilots of Water Vessels
Cardiologists
Career/Technical Education Teachers, Postsecondary
Cargo and Freight Agents
Cartographers and Photogrammetrists
Chefs and Head Cooks
Chemical Engineers
Chemical Equipment Operators and Tenders
Chemical Plant and System Operators
Chemical Technicians
Chemistry Teachers, Postsecondary
Chemists
Child, Family, and School Social Workers
Civil Engineering Technologists and Technicians
Civil Engineers
Claims Adjusters, Examiners, and Investigators
Clinical Nurse Specialists
Clinical Research Coordinators
Clinical and Counseling Psychologists
Community Health Workers
Concierges
Conservation Scientists
Construction and Building Inspectors
Control and Valve Installers and Repairers, Except Mechanical Door
Correspondence Clerks
Costume Attendants
Court, Municipal, and License Clerks
Curators
Dental Assistants
Detectives and Criminal Investigators
Dietetic Technicians
Dietitians and Nutritionists
Directors, Religious Activities and Education
Dispatchers, Except Police, Fire, and Ambulance
Economists
Editors
Education Administrators, Kindergarten through Secondary
Education Administrators, Postsecondary
Education and Childcare Administrators, Preschool and Daycare
Educational, Guidance, and Career Counselors and Advisors
Electrical Engineers
Electrical and Electronic Engineering Technologists and Technicians
Electrical and Electronics Drafters
Electrical and Electronics Installers and Repairers, Transportation Equipment
Electrical and Electronics Repairers, Commercial and Industrial Equipment
Electrical and Electronics Repairers, Powerhouse, Substation, and Relay
Emergency Management Directors
Emergency Medicine Physicians
Energy Auditors
Energy Engineers, Except Wind and Solar
Entertainment and Recreation Managers, Except Gambling
Environmental Engineering Technologists and Technicians
Environmental Engineers
Environmental Restoration Planners
Environmental Science and Protection Technicians, Including Health
Environmental Scientists and Specialists, Including Health
Epidemiologists
Etchers and Engravers
Exercise Physiologists
Fabric and Apparel Patternmakers
Facilities Managers
Family Medicine Physicians
Farm and Home Management Educators
Farmers, Ranchers, and Other Agricultural Managers
Fashion Designers
File Clerks
Financial Examiners
Fine Artists, Including Painters, Sculptors, and Illustrators
Fire Inspectors and Investigators
Fire-Prevention and Protection Engineers
Fishing and Hunting Workers
Fitness and Wellness Coordinators
Flight Attendants
Food Science Technicians
Food Scientists and Technologists
Food Service Managers
Forensic Science Technicians
Forest and Conservation Workers
Foresters
Freight Forwarders
Fuel Cell Engineers
Funeral Attendants
Funeral Home Managers
Gambling Change Persons and Booth Cashiers
General Internal Medicine Physicians
Genetic Counselors
Geneticists
Geographers
Geographic Information Systems Technologists and Technicians
Geological Technicians, Except Hydrologic Technicians
Geoscientists, Except Hydrologists and Geographers
Hazardous Materials Removal Workers
Health Education Specialists
Health Information Technologists and Medical Registrars
Health and Safety Engineers, Except Mining Safety Engineers and Inspectors
Healthcare Social Workers
Hearing Aid Specialists
Histology Technicians
Historians
Hospitalists
Hotel, Motel, and Resort Desk Clerks
Human Factors Engineers and Ergonomists
Hydrologists
Industrial Ecologists
Industrial Engineering Technologists and Technicians
Industrial Engineers
Industrial Machinery Mechanics
Information Security Engineers
Inspectors, Testers, Sorters, Samplers, and Weighers
Instructional Coordinators
Insurance Claims and Policy Processing Clerks
Interior Designers
Interpreters and Translators
Judges, Magistrate Judges, and Magistrates
Judicial Law Clerks
Labor Relations Specialists
Landscape Architects
Lawyers
Legal Secretaries and Administrative Assistants
Legislators
Librarians and Media Collections Specialists
Library Assistants, Clerical
Library Technicians
Lighting Technicians
Maintenance and Repair Workers, General
Manufacturing Engineers
Marine Engineers and Naval Architects
Marriage and Family Therapists
Materials Engineers
Mathematicians
Mechanical Drafters
Mechanical Engineering Technologists and Technicians
Mechanical Engineers
Mechatronics Engineers
Medical Assistants
Medical Dosimetrists
Medical Equipment Repairers
Medical Records Specialists
Medical Secretaries and Administrative Assistants
Medical and Clinical Laboratory Technicians
Medical and Health Services Managers
Mental Health and Substance Abuse Social Workers
Merchandise Displayers and Window Trimmers
Meter Readers, Utilities
Microbiologists
Midwives
Mining and Geological Engineers, Including Mining Safety Engineers
Mobile Heavy Equipment Mechanics, Except Engines
Molecular and Cellular Biologists
Morticians, Undertakers, and Funeral Arrangers
Motorboat Mechanics and Service Technicians
Natural Sciences Managers
Neurologists
New Accounts Clerks
Non-Destructive Testing Specialists
Nuclear Engineers
Nurse Anesthetists
Nursing Instructors and Teachers, Postsecondary
Obstetricians and Gynecologists
Occupational Health and Safety Specialists
Occupational Health and Safety Technicians
Orthopedic Surgeons, Except Pediatric
Paralegals and Legal Assistants
Park Naturalists
Parking Enforcement Workers
Patient Representatives
Pediatric Surgeons
Pediatricians, General
Penetration Testers
Pesticide Handlers, Sprayers, and Applicators, Vegetation
Petroleum Engineers
Pharmacy Aides
Phlebotomists
Photographers
Photographic Process Workers and Processing Machine Operators
Photonics Engineers
Photonics Technicians
Physicists
Political Science Teachers, Postsecondary
Power Plant Operators
Precision Agriculture Technicians
Prepress Technicians and Workers
Print Binding and Finishing Workers
Private Detectives and Investigators
Probation Officers and Correctional Treatment Specialists
Producers and Directors
Proofreaders and Copy Markers
Public Safety Telecommunicators
Radiation Therapists
Radio Frequency Identification Device Specialists
Radio, Cellular, and Tower Equipment Installers and Repairers
Radiologists
Railroad Brake, Signal, and Switch Operators and Locomotive Firers
Receptionists and Information Clerks
Recreation Workers
Rehabilitation Counselors
Reservation and Transportation Ticket Agents and Travel Clerks
Residential Advisors
Robotics Engineers
Robotics Technicians
Roustabouts, Oil and Gas
School Psychologists
Security and Fire Alarm Systems Installers
Semiconductor Processing Technicians
Separating, Filtering, Clarifying, Precipitating, and Still Machine Setters, Operators, and Tenders
Set and Exhibit Designers
Shipping, Receiving, and Inventory Clerks
Slaughterers and Meat Packers
Soil and Plant Scientists
Solar Energy Systems Engineers
Solar Sales Representatives and Assessors
Special Education Teachers, Middle School
Special Education Teachers, Secondary School
Special Effects Artists and Animators
Sports Medicine Physicians
Stationary Engineers and Boiler Operators
Structural Metal Fabricators and Fitters
Substance Abuse and Behavioral Disorder Counselors
Surveying and Mapping Technicians
Surveyors
Teaching Assistants, Postsecondary
Technical Writers
Telecommunications Engineering Specialists
Telecommunications Equipment Installers and Repairers, Except Line Installers
Telemarketers
Telephone Operators
Tellers
Tire Repairers and Changers
Traffic Technicians
Transportation Engineers
Transportation Inspectors
Transportation Planners
Transportation Vehicle, Equipment and Systems Inspectors, Except Aviation
Travel Agents
Urban and Regional Planners
Urologists
Water Resource Specialists
Water and Wastewater Treatment Plant and System Operators
Water/Wastewater Engineers
Weatherization Installers and Technicians
Weighers, Measurers, Checkers, and Samplers, Recordkeeping
Welding, Soldering, and Brazing Machine Setters, Operators, and Tenders
Wholesale and Retail Buyers, Except Farm Products
Wind Energy Engineers
Wind Turbine Service Technicians
Word Processors and Typists
Writers and Authors
Zoologists and Wildlife Biologists
"""  # Include the full roles text here

# Process the text to add quotation marks and make it a valid Python list
roles_to_remove = ['{}{}'.format(role.strip(), "'") for role in roles_text.strip().splitlines() if role.strip()]

# Normalize the 'Title' column in result_df
result_df_1['Title_normalized'] = result_df_1['Title'].str.strip().str.lower()

# Fix roles_to_remove by stripping extra apostrophes and spaces, then normalize
roles_to_remove_normalized = [
    role.strip().lower().rstrip("'") for role in roles_to_remove  # Remove trailing apostrophe
]

# Filter out the roles to remove
result_df= result_df_1[~result_df_1['Title_normalized'].isin(roles_to_remove_normalized)]

# Drop the temporary normalized column
result_df = result_df.drop(columns=['Title_normalized'])
result_df

Unnamed: 0,Title,Skills
0,Accountants and Auditors,"[Microsoft Excel, SAP software, Microsoft Offi..."
1,Actuaries,"[Microsoft Power BI, Microsoft Excel, Microsof..."
2,Administrative Services Managers,"[Microsoft Word, Microsoft Excel, Microsoft Of..."
4,Advertising Sales Agents,"[Microsoft Excel, Salesforce software, Microso..."
5,Advertising and Promotions Managers,"[Microsoft Excel, Microsoft Office software, M..."
...,...,...
436,Validation Engineers,"[Microsoft Excel, Microsoft Office software, d..."
437,Video Game Designers,"[C, play, C++, Oracle Java, JavaScript, Git, d..."
442,Web Administrators,"[testing, UNIX, Linux, Cascading style sheets ..."
443,Web Developers,"[Go, Apache Kafka, Linux, server, Cascading st..."


In [78]:
analysis_data_combined = calc_similarity_one_cv(df_resumes, result_df, parallel=True)
analysis_data_combined.head(20)

Unnamed: 0,Title,similarity_score,rank
380,Search Marketing Strategists,0.766068,1.0
270,Market Research Analysts and Marketing Special...,0.69658,2.0
310,Online Merchants,0.66085,3.0
442,Web Administrators,0.639322,4.0
394,Software Quality Assurance Analysts and Testers,0.623472,5.0
444,Web and Digital Interface Designers,0.607784,6.0
383,Security Management Specialists,0.605565,7.0
393,Software Developers,0.600698,8.0
93,Computer and Information Systems Managers,0.6004,9.0
443,Web Developers,0.591584,10.0


**Example**

In [79]:
df_resumes['Skills'].value_counts()

Skills
[marketing, communications, advertising, google analytics, wordpress, webflow, structured text]    1
Name: count, dtype: int64

In [82]:

#df_relevant[df_relevant['Title'] == 'Online Merchants']['raw'].values[0]
result_df[result_df['Title'] == 'Commercial and Industrial Designers']['Skills'].values[0]


['Microsoft Excel',
 'Autodesk AutoCAD',
 'Dassault Systemes SolidWorks',
 'Microsoft Office software',
 'Figma',
 'Microsoft PowerPoint',
 'Adobe Illustrator',
 'Adobe Photoshop',
 'Adobe Creative Cloud software',
 'marketing',
 'design']

### Different calculation of similairty considering exact matching as 30% of score with merged info

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os

def calc_similarity_with_explicit_match(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions,
    combining semantic similarity (via MPNET embeddings) and explicit keyword matching.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
   

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    def explicit_match(title, skills, keywords):
        """
        Return the number of exact keyword matches in title or skills.
        """
        # Normalize the lists to lowercase and strip whitespace
        title_words = set(title.lower().split())
        skill_words = set(skill.lower().strip() for skill in skills)
        keyword_set = set(keyword.lower().strip() for keyword in keywords)

        # Check for exact matches in title or skills
        title_match = keyword_set & title_words  # Intersection of sets for title
        skills_match = keyword_set & skill_words  # Intersection of sets for skills

        # Return the total number of matches
        return len(title_match) + len(skills_match)
    
    def calculate_normalized_match(row, cv_skills):
        # Count the number of matches
        matches = explicit_match(row['Title'], row['Skills'], cv_skills)
        # Normalize by the combined length of job and CV skills
        normalized_score = matches / (len(row['Skills']) + len(cv_skills))
        return normalized_score

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute semantic similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()
    job_df['semantic_similarity'] = similarity_scores
    '''
    # Compute explicit matching scores
    cv_skills = df_resumes['Skills'].iloc[0]

    # Compute number of explicit matches for each job
    result_df['explicit_match'] = job_df.apply(
        lambda row: explicit_match(row['Title'], row['Skills'], cv_skills),
        axis=1
    )

    # Compute normalized match score for each job
    result_df['normalized_match_score'] = result_df.apply(
        lambda row: calculate_normalized_match(row, cv_skills),
        axis=1
    )

    # Compute the final score as a weighted combination
    result_df['final_score'] = (
        0.8 * result_df['semantic_similarity'] +  # 80% weight for semantic similarity
        0.2 * result_df['normalized_match_score']  # 20% weight for normalized match score
    )   
    '''
    # Sort by the final score
    result_df_sorted = result_df.sort_values(by='final_score', ascending=False)

    # Return a DataFrame with the job titles, scores, and ranks
    result_df_sorted = result_df_sorted[['Title', 'semantic_similarity', 'explicit_match', 'normalized_match_score', 'final_score', 'rank']].sort_values(by='rank', ascending=True)
    return result_df_sorted


In [88]:
analysis_data_new = calc_similarity_with_explicit_match(df_resumes, result_df, parallel=True)
analysis_data_new.head(20)

Unnamed: 0,Title,semantic_similarity,explicit_match,normalized_match_score,final_score,rank
380,Search Marketing Strategists,0.766068,4,0.210526,0.65496,1.0
270,Market Research Analysts and Marketing Special...,0.69658,4,0.235294,0.604322,2.0
310,Online Merchants,0.66085,2,0.142857,0.557252,3.0
442,Web Administrators,0.639322,2,0.064516,0.524361,4.0
394,Software Quality Assurance Analysts and Testers,0.623472,0,0.0,0.498777,5.0
444,Web and Digital Interface Designers,0.607784,0,0.0,0.486227,6.0
383,Security Management Specialists,0.605565,0,0.0,0.484452,7.0
393,Software Developers,0.600698,0,0.0,0.480559,8.0
93,Computer and Information Systems Managers,0.6004,0,0.0,0.48032,9.0
443,Web Developers,0.591584,1,0.021277,0.477522,10.0


In [None]:
analysis_data_new['normalized_match_score'].value_counts()

normalized_match_score
0.000000    150
0.142857      2
0.210526      1
0.235294      1
0.064516      1
0.021277      1
0.153846      1
0.090909      1
0.166667      1
0.032258      1
0.041667      1
0.071429      1
0.058824      1
0.055556      1
0.062500      1
Name: count, dtype: int64

: 

#### Example Check

In [70]:
df_resumes['Skills'].value_counts()

Skills
[business, marketing, code, customer relationship management, business administration, international trade, communications, design, wordpress, boosting machine learning, advertising, go]    1
Name: count, dtype: int64

In [None]:

#df_relevant[df_relevant['Title'] == 'Online Merchants']['raw'].values[0]
result_df[result_df['Title'] == 'Web Developers']['Skills'].values[0]


['Spring Boot',
 'MySQL',
 'Atlassian JIRA',
 'Amazon Web Services AWS software',
 'Bootstrap',
 'Hypertext markup language HTML',
 'Vue.js',
 'Cascading style sheets CSS',
 'database',
 'Apache Kafka',
 'NoSQL',
 'WordPress',
 'Linux',
 'server',
 'Oracle Java',
 'Microsoft Azure software',
 'scalability',
 'Python',
 'Git',
 'JavaScript',
 'Spring Framework',
 'Extensible markup language XML',
 'JavaScript Object Notation JSON',
 'jQuery',
 'TypeScript',
 'C#',
 'MongoDB',
 'Go',
 'PHP',
 'React',
 'Google Angular',
 'Docker',
 'GitHub',
 'code',
 'Jenkins CI',
 'Kubernetes',
 'web development',
 'PostgreSQL',
 'Structured query language SQL',
 'Node.js']