### Import

In [1]:
import os
import pandas as pd
from services.ResumeInfoExtraction import ResumeInfoExtraction
from services.JobInfoExtraction import JobInfoExtraction
from source.schemas.resumeextracted import ResumeExtractedModel # Let's reintroduce later on
from source.schemas.jobextracted import JobExtractedModel # Let's reintroduce later on
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
import markdown
import warnings 
import logging
import torch
import torch.nn.functional as F

logging.getLogger('pypdf').setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [2]:
import main

In [31]:
df_resumes = main.get_resumes('resumes')

In [32]:
df_resumes = main.resume_extraction(df_resumes)
df_resumes

Unnamed: 0,name,raw,Skills
0,CV Ilaria Gioia eng,Ilaria Gioia \n \n \n \nCONTACTS \n+39 3661699...,"[marketing, communications, Canva, Microsoft W..."
1,CV Rick 2,Riccardo Lombardo \n(+39) 338 214 2704 \nricca...,"[business, finance, business administration, c..."
2,CV.Ludovica Baccilieri,\nLUDOVICA BACCILIERI \n \n \n \n \n \n \n \n...,"[business, marketing, code, customer relations..."
3,hanna_pedersen,\n \n \n \n \n \n \n \n \nEDUCATION \n \nNova...,"[business, analytics, business administration,..."
4,Irene Abbatelli CV,"\n \n \nIRENE ABBATELLI \nRome, Italy \n(+39)...","[business, analytics, data analysis, data visu..."
5,Luca_Oeztekin-checkpoint,"LUCA OEZTEKIN\nLisbon, Portugal & Cologne, Ger...","[analytics, artificial intelligence, modelling..."
6,Resume_Billi_Leonardo,"\nLEONARDO BILLI \n \n \n \n \n \nRome, Ital...","[business, project management, finance, busine..."
7,Tim_gunkel2,TIM GUNKEL \n \n \n \n \n \n \n ...,"[business, analytics, machine learning, busine..."
8,victor_bjorsvik-checkpoint,"VICTOR BJORSVIK \n \nLisbon, Portugal | +47 91...","[accounting, big data, communications, busines..."


In [33]:
df_resumes = df_resumes.iloc[[1]]
df_resumes

Unnamed: 0,name,raw,Skills
1,CV Rick 2,Riccardo Lombardo \n(+39) 338 214 2704 \nricca...,"[business, finance, business administration, c..."


*conda install -c conda-forge openpyxl*

### Skill_Role df

In [9]:
file_path = os.path.join('skills_roles_data', 'Technology Skills.xlsx')
df_skill_role = pd.read_excel(file_path)
df_skill_role

Unnamed: 0,O*NET-SOC Code,Title,Example,Commodity Code,Commodity Title,Hot Technology,In Demand
0,11-1011.00,Chief Executives,Adobe Acrobat,43232202,Document management software,Y,N
1,11-1011.00,Chief Executives,AdSense Tracker,43232306,Data base user interface and query software,N,N
2,11-1011.00,Chief Executives,Atlassian JIRA,43232201,Content workflow software,Y,N
3,11-1011.00,Chief Executives,Blackbaud The Raiser's Edge,43232303,Customer relationship management CRM software,N,N
4,11-1011.00,Chief Executives,ComputerEase construction accounting software,43231601,Accounting software,N,N
...,...,...,...,...,...,...,...
32622,53-7121.00,"Tank Car, Truck, and Ship Loaders",Linux,43233004,Operating system software,Y,N
32623,53-7121.00,"Tank Car, Truck, and Ship Loaders",Microsoft Excel,43232110,Spreadsheet software,Y,N
32624,53-7121.00,"Tank Car, Truck, and Ship Loaders",Microsoft Office software,43231513,Office suite software,Y,N
32625,53-7121.00,"Tank Car, Truck, and Ship Loaders",SAP software,43231602,Enterprise resource planning ERP software,Y,N


In [10]:
df_skill_role = df_skill_role.drop(columns=['O*NET-SOC Code', 'Commodity Code', 'Commodity Title'])

In [11]:
df_skill_role_filtered = df_skill_role[(df_skill_role['Hot Technology'] == 'Y') & (df_skill_role['In Demand'] == 'Y')]
df_skill_role_filtered

Unnamed: 0,Title,Example,Hot Technology,In Demand
27,Chief Executives,Microsoft Excel,Y,Y
29,Chief Executives,Microsoft Office software,Y,Y
31,Chief Executives,Microsoft PowerPoint,Y,Y
57,Chief Sustainability Officers,Microsoft Office software,Y,Y
140,General and Operations Managers,Microsoft Excel,Y,Y
...,...,...,...,...
32460,Aviation Inspectors,Microsoft Excel,Y,Y
32464,Aviation Inspectors,Microsoft Word,Y,Y
32474,"Transportation Vehicle, Equipment and Systems ...",Microsoft Excel,Y,Y
32475,"Transportation Vehicle, Equipment and Systems ...",Microsoft Office software,Y,Y


In [12]:
df_skill_role_grouped = df_skill_role_filtered.groupby('Title')['Example'].apply(list).reset_index()
df_skill_role_grouped.rename(columns={'Example': 'Skills'}, inplace=True)
df_skill_role_grouped

Unnamed: 0,Title,Skills
0,Accountants and Auditors,"[Intuit QuickBooks, Microsoft Excel, Microsoft..."
1,Actuaries,"[Microsoft Excel, Microsoft Office software, M..."
2,Administrative Services Managers,"[Microsoft Excel, Microsoft Office software, M..."
3,"Adult Basic Education, Adult Secondary Educati...","[Microsoft Excel, Microsoft Office software]"
4,Advertising Sales Agents,"[Adobe Creative Cloud software, Adobe Illustra..."
...,...,...
448,Wind Energy Engineers,"[C++, Microsoft Excel, Python, The MathWorks M..."
449,Wind Turbine Service Technicians,"[Microsoft Office software, SAP software]"
450,Word Processors and Typists,"[Microsoft Excel, Microsoft Office software, M..."
451,Writers and Authors,"[Adobe Photoshop, Microsoft Excel, Microsoft O..."


### Job Description df

In [13]:
file_path_1 = os.path.join('skills_roles_data', 'Occupation Data.xlsx')
df_jobdescription = pd.read_excel(file_path_1)
df_jobdescription

Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
...,...,...,...
1011,55-3014.00,Artillery and Missile Crew Members,"Target, fire, and maintain weapons used to des..."
1012,55-3015.00,Command and Control Center Specialists,"Operate and monitor communications, detection,..."
1013,55-3016.00,Infantry,Operate weapons and equipment in ground combat...
1014,55-3018.00,Special Forces,"Implement unconventional operations by air, la..."


In [14]:
df_jobdescription.rename(columns={'Description': 'raw'}, inplace=True)
df_jobdescription = df_jobdescription.drop(columns=['O*NET-SOC Code'])

In [15]:
# Get unique titles from df_skill_role_grouped
unique_titles = df_skill_role_grouped['Title'].unique()

# Filter df_jobs to only include rows with titles in unique_titles
filtered_df_jobs = df_jobdescription[df_jobdescription['Title'].isin(unique_titles)]

filtered_df_jobs

Unnamed: 0,Title,raw
0,Chief Executives,Determine and formulate policies and provide o...
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,Legislators,"Develop, introduce, or enact laws and statutes..."
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
...,...,...
967,"Captains, Mates, and Pilots of Water Vessels",Command or supervise operations of ships and w...
974,Traffic Technicians,Conduct field studies to determine traffic vol...
975,Transportation Inspectors,Inspect equipment or goods in connection with ...
976,Aviation Inspectors,"Inspect aircraft, maintenance procedures, air ..."


In [16]:
# Convert the numpy array to a list
unique_titles_list = unique_titles.tolist()

# Define the file path
output_file_path = 'unique_titles.txt'

# Write the list to a text file
with open(output_file_path, 'w') as file:
    for title in unique_titles_list:
        file.write(f"{title}\n")

print(f"Unique titles have been written to {output_file_path}")

Unique titles have been written to unique_titles.txt


In [17]:
df_jobs = main.job_info_extraction(filtered_df_jobs)
df_jobs

Unnamed: 0,Title,raw,Skills
0,Chief Executives,Determine and formulate policies and provide o...,[operations research]
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[operations research]
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",[operations research]
3,Legislators,"Develop, introduce, or enact laws and statutes...",[operations research]
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...","[operations research, advertising]"
...,...,...,...
967,"Captains, Mates, and Pilots of Water Vessels",Command or supervise operations of ships and w...,[operations research]
974,Traffic Technicians,Conduct field studies to determine traffic vol...,[traffic engineering]
975,Transportation Inspectors,Inspect equipment or goods in connection with ...,[operations research]
976,Aviation Inspectors,"Inspect aircraft, maintenance procedures, air ...",[communications]


### Similarity score calculation separated 

In [18]:
# REMOVING SOME ROLES

# Original multiline text
roles_text = """
Agents and Business Managers of Artists, Performers, and Athletes
Adult Basic Education, Adult Secondary Education, and English as a Second Language Instructors
Aerospace Engineers
Agents and Business Managersof Artists, Pereformers, and Athletes
Agricultural Inspectors
Agricultural Technicians
Air Traffic Controllers
Aircraft Mechanics and Service Technicians
Airfield Operations Specialists
Anesthesiologists
Animal Control Workers
Anthropologists and Archeologists
Appraisers and Assessors of Real Estate
Arbitrators, Mediators, and Conciliators
Architects, Except Landscape and Naval
Architectural and Civil Drafters
Architectural and Engineering Managers
Architecture Teachers, Postsecondary
Archivists
Art Directors
Art Therapists
Art, Drama, and Music Teachers, Postsecondary
Astronomers
Athletes and Sports Competitors
Atmospheric and Space Scientists
Audio and Video Technicians
Audiologists
Automotive Engineers
Aviation Inspectors
Avionics Technicians
Biochemists and Biophysicists
Bioengineers and Biomedical Engineers
Bioinformatics Scientists
Biological Technicians
Biologists
Biostatisticians
Broadcast Announcers and Radio Disc Jockeys
Broadcast Technicians
Brownfield Redevelopment Specialists and Site Managers
Buyers and Purchasing Agents, Farm Products
Calibration Technologists and Technicians
Camera Operators, Television, Video, and Film
Captains, Mates, and Pilots of Water Vessels
Cardiologists
Career/Technical Education Teachers, Postsecondary
Cargo and Freight Agents
Cartographers and Photogrammetrists
Chefs and Head Cooks
Chemical Engineers
Chemical Equipment Operators and Tenders
Chemical Plant and System Operators
Chemical Technicians
Chemistry Teachers, Postsecondary
Chemists
Child, Family, and School Social Workers
Civil Engineering Technologists and Technicians
Civil Engineers
Claims Adjusters, Examiners, and Investigators
Clinical Nurse Specialists
Clinical Research Coordinators
Clinical and Counseling Psychologists
Community Health Workers
Concierges
Conservation Scientists
Construction and Building Inspectors
Control and Valve Installers and Repairers, Except Mechanical Door
Correspondence Clerks
Costume Attendants
Court, Municipal, and License Clerks
Curators
Dental Assistants
Detectives and Criminal Investigators
Dietetic Technicians
Dietitians and Nutritionists
Directors, Religious Activities and Education
Dispatchers, Except Police, Fire, and Ambulance
Economists
Editors
Education Administrators, Kindergarten through Secondary
Education Administrators, Postsecondary
Education and Childcare Administrators, Preschool and Daycare
Educational, Guidance, and Career Counselors and Advisors
Electrical Engineers
Electrical and Electronic Engineering Technologists and Technicians
Electrical and Electronics Drafters
Electrical and Electronics Installers and Repairers, Transportation Equipment
Electrical and Electronics Repairers, Commercial and Industrial Equipment
Electrical and Electronics Repairers, Powerhouse, Substation, and Relay
Emergency Management Directors
Emergency Medicine Physicians
Energy Auditors
Energy Engineers, Except Wind and Solar
Entertainment and Recreation Managers, Except Gambling
Environmental Engineering Technologists and Technicians
Environmental Engineers
Environmental Restoration Planners
Environmental Science and Protection Technicians, Including Health
Environmental Scientists and Specialists, Including Health
Epidemiologists
Etchers and Engravers
Exercise Physiologists
Fabric and Apparel Patternmakers
Facilities Managers
Family Medicine Physicians
Farm and Home Management Educators
Farmers, Ranchers, and Other Agricultural Managers
Fashion Designers
File Clerks
Financial Examiners
Fine Artists, Including Painters, Sculptors, and Illustrators
Fire Inspectors and Investigators
Fire-Prevention and Protection Engineers
Fishing and Hunting Workers
Fitness and Wellness Coordinators
Flight Attendants
Food Science Technicians
Food Scientists and Technologists
Food Service Managers
Forensic Science Technicians
Forest and Conservation Workers
Foresters
Freight Forwarders
Fuel Cell Engineers
Funeral Attendants
Funeral Home Managers
Gambling Change Persons and Booth Cashiers
General Internal Medicine Physicians
Genetic Counselors
Geneticists
Geographers
Geographic Information Systems Technologists and Technicians
Geological Technicians, Except Hydrologic Technicians
Geoscientists, Except Hydrologists and Geographers
Hazardous Materials Removal Workers
Health Education Specialists
Health Information Technologists and Medical Registrars
Health and Safety Engineers, Except Mining Safety Engineers and Inspectors
Healthcare Social Workers
Hearing Aid Specialists
Histology Technicians
Historians
Hospitalists
Hotel, Motel, and Resort Desk Clerks
Human Factors Engineers and Ergonomists
Hydrologists
Industrial Ecologists
Industrial Engineering Technologists and Technicians
Industrial Engineers
Industrial Machinery Mechanics
Information Security Engineers
Inspectors, Testers, Sorters, Samplers, and Weighers
Instructional Coordinators
Insurance Claims and Policy Processing Clerks
Interior Designers
Interpreters and Translators
Judges, Magistrate Judges, and Magistrates
Judicial Law Clerks
Labor Relations Specialists
Landscape Architects
Lawyers
Legal Secretaries and Administrative Assistants
Legislators
Librarians and Media Collections Specialists
Library Assistants, Clerical
Library Technicians
Lighting Technicians
Maintenance and Repair Workers, General
Manufacturing Engineers
Marine Engineers and Naval Architects
Marriage and Family Therapists
Materials Engineers
Mathematicians
Mechanical Drafters
Mechanical Engineering Technologists and Technicians
Mechanical Engineers
Mechatronics Engineers
Medical Assistants
Medical Dosimetrists
Medical Equipment Repairers
Medical Records Specialists
Medical Secretaries and Administrative Assistants
Medical and Clinical Laboratory Technicians
Medical and Health Services Managers
Mental Health and Substance Abuse Social Workers
Merchandise Displayers and Window Trimmers
Meter Readers, Utilities
Microbiologists
Midwives
Mining and Geological Engineers, Including Mining Safety Engineers
Mobile Heavy Equipment Mechanics, Except Engines
Molecular and Cellular Biologists
Morticians, Undertakers, and Funeral Arrangers
Motorboat Mechanics and Service Technicians
Natural Sciences Managers
Neurologists
New Accounts Clerks
Non-Destructive Testing Specialists
Nuclear Engineers
Nurse Anesthetists
Nursing Instructors and Teachers, Postsecondary
Obstetricians and Gynecologists
Occupational Health and Safety Specialists
Occupational Health and Safety Technicians
Orthopedic Surgeons, Except Pediatric
Paralegals and Legal Assistants
Park Naturalists
Parking Enforcement Workers
Patient Representatives
Pediatric Surgeons
Pediatricians, General
Penetration Testers
Pesticide Handlers, Sprayers, and Applicators, Vegetation
Petroleum Engineers
Pharmacy Aides
Phlebotomists
Photographers
Photographic Process Workers and Processing Machine Operators
Photonics Engineers
Photonics Technicians
Physicists
Political Science Teachers, Postsecondary
Power Plant Operators
Precision Agriculture Technicians
Prepress Technicians and Workers
Print Binding and Finishing Workers
Private Detectives and Investigators
Probation Officers and Correctional Treatment Specialists
Producers and Directors
Proofreaders and Copy Markers
Public Safety Telecommunicators
Radiation Therapists
Radio Frequency Identification Device Specialists
Radio, Cellular, and Tower Equipment Installers and Repairers
Radiologists
Railroad Brake, Signal, and Switch Operators and Locomotive Firers
Receptionists and Information Clerks
Recreation Workers
Rehabilitation Counselors
Reservation and Transportation Ticket Agents and Travel Clerks
Residential Advisors
Robotics Engineers
Robotics Technicians
Roustabouts, Oil and Gas
School Psychologists
Security and Fire Alarm Systems Installers
Semiconductor Processing Technicians
Separating, Filtering, Clarifying, Precipitating, and Still Machine Setters, Operators, and Tenders
Set and Exhibit Designers
Shipping, Receiving, and Inventory Clerks
Slaughterers and Meat Packers
Soil and Plant Scientists
Solar Energy Systems Engineers
Solar Sales Representatives and Assessors
Special Education Teachers, Middle School
Special Education Teachers, Secondary School
Special Effects Artists and Animators
Sports Medicine Physicians
Stationary Engineers and Boiler Operators
Structural Metal Fabricators and Fitters
Substance Abuse and Behavioral Disorder Counselors
Surveying and Mapping Technicians
Surveyors
Teaching Assistants, Postsecondary
Technical Writers
Telecommunications Engineering Specialists
Telecommunications Equipment Installers and Repairers, Except Line Installers
Telemarketers
Telephone Operators
Tellers
Tire Repairers and Changers
Traffic Technicians
Transportation Engineers
Transportation Inspectors
Transportation Planners
Transportation Vehicle, Equipment and Systems Inspectors, Except Aviation
Travel Agents
Urban and Regional Planners
Urologists
Water Resource Specialists
Water and Wastewater Treatment Plant and System Operators
Water/Wastewater Engineers
Weatherization Installers and Technicians
Weighers, Measurers, Checkers, and Samplers, Recordkeeping
Welding, Soldering, and Brazing Machine Setters, Operators, and Tenders
Wholesale and Retail Buyers, Except Farm Products
Wind Energy Engineers
Wind Turbine Service Technicians
Word Processors and Typists
Writers and Authors
Zoologists and Wildlife Biologists
"""  # Include the full roles text here

# Process the text to add quotation marks and make it a valid Python list
roles_to_remove = ['{}{}'.format(role.strip(), "'") for role in roles_text.strip().splitlines() if role.strip()]

# Revome the roles that are in the roles_to_remove list for df_skill_role_grouped

# Normalize the 'Title' column in result_df
df_jobs['Title_normalized'] = df_jobs['Title'].str.strip().str.lower()

# Fix roles_to_remove by stripping extra apostrophes and spaces, then normalize
roles_to_remove_normalized = [
    role.strip().lower().rstrip("'") for role in roles_to_remove  # Remove trailing apostrophe
]

# Filter out the roles to remove
df_jobs= df_jobs[~df_jobs['Title_normalized'].isin(roles_to_remove_normalized)]

# Drop the temporary normalized column
df_jobs = df_jobs.drop(columns=['Title_normalized'])



# Revome the roles that are in the roles_to_remove list for df_skill_role_grouped

# Normalize the 'Title' column in result_df
df_skill_role_grouped['Title_normalized'] = df_skill_role_grouped['Title'].str.strip().str.lower()

# Fix roles_to_remove by stripping extra apostrophes and spaces, then normalize
roles_to_remove_normalized = [
    role.strip().lower().rstrip("'") for role in roles_to_remove  # Remove trailing apostrophe
]

# Filter out the roles to remove
df_skill_role_grouped= df_skill_role_grouped[~df_skill_role_grouped['Title_normalized'].isin(roles_to_remove_normalized)]

# Drop the temporary normalized column
df_skill_role_grouped = df_skill_role_grouped.drop(columns=['Title_normalized'])

In [19]:
def calc_similarity_one_cv(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions 
    based on MPNET embeddings of combined skills.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()

    # Add similarity scores to the job descriptions DataFrame
    job_df['similarity_score'] = similarity_scores

    # Rank the jobs by similarity score
    job_df['rank'] = job_df['similarity_score'].rank(ascending=False)

    # Return a DataFrame with the job titles, similarity scores, and ranks
    ranked_jobs = job_df[['Title', 'similarity_score', 'rank']].sort_values(by='rank', ascending=True)
    return ranked_jobs


In [20]:
analysis_data_skills = calc_similarity_one_cv(df_resumes, df_skill_role_grouped, parallel=True)
analysis_data_skills.head(20)

Unnamed: 0,Title,similarity_score,rank
394,Software Quality Assurance Analysts and Testers,0.574147,1.0
111,Data Scientists,0.554092,2.0
50,Business Intelligence Analysts,0.551776,3.0
114,Database Architects,0.547947,4.0
93,Computer and Information Systems Managers,0.538427,5.0
404,Statisticians,0.532694,6.0
166,Financial and Investment Analysts,0.532059,7.0
89,Computer Systems Analysts,0.523032,8.0
262,Logistics Analysts,0.520092,9.0
112,Data Warehousing Specialists,0.519424,10.0


In [21]:
analysis_data_descr = calc_similarity_one_cv(df_resumes, df_jobs, parallel=True)
analysis_data_descr.head(20)

Unnamed: 0,Title,similarity_score,rank
122,Software Developers,0.702189,1.0
109,Computer Systems Analysts,0.658257,2.0
636,Sales Engineers,0.654004,3.0
646,"Bookkeeping, Accounting, and Auditing Clerks",0.590135,4.0
85,Market Research Analysts and Marketing Special...,0.580015,5.0
88,Business Continuity Planners,0.573385,6.0
655,"Credit Authorizers, Checkers, and Clerks",0.561927,8.0
140,Statisticians,0.561927,8.0
104,"Tax Examiners and Collectors, and Revenue Agents",0.561927,8.0
90,Online Merchants,0.55144,10.0


**Example check**

In [22]:
df_resumes['Skills'].value_counts()

Skills
[business, project management, finance, business administration, php, sql, operations research]    1
Name: count, dtype: int64

In [23]:
df_jobs[df_jobs['Title'] == 'Computer Systems Analysts']['Skills'].values[0]

['engineering',
 'business',
 'data processing',
 'operations research',
 'systems management',
 'schedule',
 'software']

In [24]:
df_skill_role_grouped[df_skill_role_grouped['Title'] == 'Computer Systems Analysts']['Skills'].values[0]

['Atlassian JIRA',
 'Microsoft Excel',
 'Microsoft Office software',
 'Microsoft PowerPoint',
 'Microsoft SharePoint',
 'Microsoft Visio',
 'Python',
 'SAP software',
 'ServiceNow',
 'Structured query language SQL']

### Similarity score calculation merged 

In [34]:
# Merge the dataframes on the Title column
merged_skills = pd.merge(
    df_jobs[['Title', 'Skills']],
    df_skill_role_grouped[['Title', 'Skills']],
    on='Title',
    how='outer',
    suffixes=('_jobs', '_hard')
)

# Combine the lists from both 'Skills' columns into one
merged_skills['Skills'] = merged_skills.apply(
    lambda row: list(set(row['Skills_jobs'] or []) | set(row['Skills_hard'] or [])),
    axis=1
)

# Select only the 'Title' and merged 'Skills' columns
result_df_1 = merged_skills[['Title', 'Skills']]  


In [35]:
# Original multiline text
roles_text = """
Agents and Business Managers of Artists, Performers, and Athletes
Adult Basic Education, Adult Secondary Education, and English as a Second Language Instructors
Aerospace Engineers
Agents and Business Managersof Artists, Pereformers, and Athletes
Agricultural Inspectors
Agricultural Technicians
Air Traffic Controllers
Aircraft Mechanics and Service Technicians
Airfield Operations Specialists
Anesthesiologists
Animal Control Workers
Anthropologists and Archeologists
Appraisers and Assessors of Real Estate
Arbitrators, Mediators, and Conciliators
Architects, Except Landscape and Naval
Architectural and Civil Drafters
Architectural and Engineering Managers
Architecture Teachers, Postsecondary
Archivists
Art Directors
Art Therapists
Art, Drama, and Music Teachers, Postsecondary
Astronomers
Athletes and Sports Competitors
Atmospheric and Space Scientists
Audio and Video Technicians
Audiologists
Automotive Engineers
Aviation Inspectors
Avionics Technicians
Biochemists and Biophysicists
Bioengineers and Biomedical Engineers
Bioinformatics Scientists
Biological Technicians
Biologists
Biostatisticians
Broadcast Announcers and Radio Disc Jockeys
Broadcast Technicians
Brownfield Redevelopment Specialists and Site Managers
Buyers and Purchasing Agents, Farm Products
Calibration Technologists and Technicians
Camera Operators, Television, Video, and Film
Captains, Mates, and Pilots of Water Vessels
Cardiologists
Career/Technical Education Teachers, Postsecondary
Cargo and Freight Agents
Cartographers and Photogrammetrists
Chefs and Head Cooks
Chemical Engineers
Chemical Equipment Operators and Tenders
Chemical Plant and System Operators
Chemical Technicians
Chemistry Teachers, Postsecondary
Chemists
Child, Family, and School Social Workers
Civil Engineering Technologists and Technicians
Civil Engineers
Claims Adjusters, Examiners, and Investigators
Clinical Nurse Specialists
Clinical Research Coordinators
Clinical and Counseling Psychologists
Community Health Workers
Concierges
Conservation Scientists
Construction and Building Inspectors
Control and Valve Installers and Repairers, Except Mechanical Door
Correspondence Clerks
Costume Attendants
Court, Municipal, and License Clerks
Curators
Dental Assistants
Detectives and Criminal Investigators
Dietetic Technicians
Dietitians and Nutritionists
Directors, Religious Activities and Education
Dispatchers, Except Police, Fire, and Ambulance
Economists
Editors
Education Administrators, Kindergarten through Secondary
Education Administrators, Postsecondary
Education and Childcare Administrators, Preschool and Daycare
Educational, Guidance, and Career Counselors and Advisors
Electrical Engineers
Electrical and Electronic Engineering Technologists and Technicians
Electrical and Electronics Drafters
Electrical and Electronics Installers and Repairers, Transportation Equipment
Electrical and Electronics Repairers, Commercial and Industrial Equipment
Electrical and Electronics Repairers, Powerhouse, Substation, and Relay
Emergency Management Directors
Emergency Medicine Physicians
Energy Auditors
Energy Engineers, Except Wind and Solar
Entertainment and Recreation Managers, Except Gambling
Environmental Engineering Technologists and Technicians
Environmental Engineers
Environmental Restoration Planners
Environmental Science and Protection Technicians, Including Health
Environmental Scientists and Specialists, Including Health
Epidemiologists
Etchers and Engravers
Exercise Physiologists
Fabric and Apparel Patternmakers
Facilities Managers
Family Medicine Physicians
Farm and Home Management Educators
Farmers, Ranchers, and Other Agricultural Managers
Fashion Designers
File Clerks
Financial Examiners
Fine Artists, Including Painters, Sculptors, and Illustrators
Fire Inspectors and Investigators
Fire-Prevention and Protection Engineers
Fishing and Hunting Workers
Fitness and Wellness Coordinators
Flight Attendants
Food Science Technicians
Food Scientists and Technologists
Food Service Managers
Forensic Science Technicians
Forest and Conservation Workers
Foresters
Freight Forwarders
Fuel Cell Engineers
Funeral Attendants
Funeral Home Managers
Gambling Change Persons and Booth Cashiers
General Internal Medicine Physicians
Genetic Counselors
Geneticists
Geographers
Geographic Information Systems Technologists and Technicians
Geological Technicians, Except Hydrologic Technicians
Geoscientists, Except Hydrologists and Geographers
Hazardous Materials Removal Workers
Health Education Specialists
Health Information Technologists and Medical Registrars
Health and Safety Engineers, Except Mining Safety Engineers and Inspectors
Healthcare Social Workers
Hearing Aid Specialists
Histology Technicians
Historians
Hospitalists
Hotel, Motel, and Resort Desk Clerks
Human Factors Engineers and Ergonomists
Hydrologists
Industrial Ecologists
Industrial Engineering Technologists and Technicians
Industrial Engineers
Industrial Machinery Mechanics
Information Security Engineers
Inspectors, Testers, Sorters, Samplers, and Weighers
Instructional Coordinators
Insurance Claims and Policy Processing Clerks
Interior Designers
Interpreters and Translators
Judges, Magistrate Judges, and Magistrates
Judicial Law Clerks
Labor Relations Specialists
Landscape Architects
Lawyers
Legal Secretaries and Administrative Assistants
Legislators
Librarians and Media Collections Specialists
Library Assistants, Clerical
Library Technicians
Lighting Technicians
Maintenance and Repair Workers, General
Manufacturing Engineers
Marine Engineers and Naval Architects
Marriage and Family Therapists
Materials Engineers
Mathematicians
Mechanical Drafters
Mechanical Engineering Technologists and Technicians
Mechanical Engineers
Mechatronics Engineers
Medical Assistants
Medical Dosimetrists
Medical Equipment Repairers
Medical Records Specialists
Medical Secretaries and Administrative Assistants
Medical and Clinical Laboratory Technicians
Medical and Health Services Managers
Mental Health and Substance Abuse Social Workers
Merchandise Displayers and Window Trimmers
Meter Readers, Utilities
Microbiologists
Midwives
Mining and Geological Engineers, Including Mining Safety Engineers
Mobile Heavy Equipment Mechanics, Except Engines
Molecular and Cellular Biologists
Morticians, Undertakers, and Funeral Arrangers
Motorboat Mechanics and Service Technicians
Natural Sciences Managers
Neurologists
New Accounts Clerks
Non-Destructive Testing Specialists
Nuclear Engineers
Nurse Anesthetists
Nursing Instructors and Teachers, Postsecondary
Obstetricians and Gynecologists
Occupational Health and Safety Specialists
Occupational Health and Safety Technicians
Orthopedic Surgeons, Except Pediatric
Paralegals and Legal Assistants
Park Naturalists
Parking Enforcement Workers
Patient Representatives
Pediatric Surgeons
Pediatricians, General
Penetration Testers
Pesticide Handlers, Sprayers, and Applicators, Vegetation
Petroleum Engineers
Pharmacy Aides
Phlebotomists
Photographers
Photographic Process Workers and Processing Machine Operators
Photonics Engineers
Photonics Technicians
Physicists
Political Science Teachers, Postsecondary
Power Plant Operators
Precision Agriculture Technicians
Prepress Technicians and Workers
Print Binding and Finishing Workers
Private Detectives and Investigators
Probation Officers and Correctional Treatment Specialists
Producers and Directors
Proofreaders and Copy Markers
Public Safety Telecommunicators
Radiation Therapists
Radio Frequency Identification Device Specialists
Radio, Cellular, and Tower Equipment Installers and Repairers
Radiologists
Railroad Brake, Signal, and Switch Operators and Locomotive Firers
Receptionists and Information Clerks
Recreation Workers
Rehabilitation Counselors
Reservation and Transportation Ticket Agents and Travel Clerks
Residential Advisors
Robotics Engineers
Robotics Technicians
Roustabouts, Oil and Gas
School Psychologists
Security and Fire Alarm Systems Installers
Semiconductor Processing Technicians
Separating, Filtering, Clarifying, Precipitating, and Still Machine Setters, Operators, and Tenders
Set and Exhibit Designers
Shipping, Receiving, and Inventory Clerks
Slaughterers and Meat Packers
Soil and Plant Scientists
Solar Energy Systems Engineers
Solar Sales Representatives and Assessors
Special Education Teachers, Middle School
Special Education Teachers, Secondary School
Special Effects Artists and Animators
Sports Medicine Physicians
Stationary Engineers and Boiler Operators
Structural Metal Fabricators and Fitters
Substance Abuse and Behavioral Disorder Counselors
Surveying and Mapping Technicians
Surveyors
Teaching Assistants, Postsecondary
Technical Writers
Telecommunications Engineering Specialists
Telecommunications Equipment Installers and Repairers, Except Line Installers
Telemarketers
Telephone Operators
Tellers
Tire Repairers and Changers
Traffic Technicians
Transportation Engineers
Transportation Inspectors
Transportation Planners
Transportation Vehicle, Equipment and Systems Inspectors, Except Aviation
Travel Agents
Urban and Regional Planners
Urologists
Water Resource Specialists
Water and Wastewater Treatment Plant and System Operators
Water/Wastewater Engineers
Weatherization Installers and Technicians
Weighers, Measurers, Checkers, and Samplers, Recordkeeping
Welding, Soldering, and Brazing Machine Setters, Operators, and Tenders
Wholesale and Retail Buyers, Except Farm Products
Wind Energy Engineers
Wind Turbine Service Technicians
Word Processors and Typists
Writers and Authors
Zoologists and Wildlife Biologists
"""  # Include the full roles text here

# Process the text to add quotation marks and make it a valid Python list
roles_to_remove = ['{}{}'.format(role.strip(), "'") for role in roles_text.strip().splitlines() if role.strip()]

# Normalize the 'Title' column in result_df
result_df_1['Title_normalized'] = result_df_1['Title'].str.strip().str.lower()

# Fix roles_to_remove by stripping extra apostrophes and spaces, then normalize
roles_to_remove_normalized = [
    role.strip().lower().rstrip("'") for role in roles_to_remove  # Remove trailing apostrophe
]

# Filter out the roles to remove
result_df= result_df_1[~result_df_1['Title_normalized'].isin(roles_to_remove_normalized)]

# Drop the temporary normalized column
result_df = result_df.drop(columns=['Title_normalized'])
result_df

Unnamed: 0,Title,Skills
0,Accountants and Auditors,"[Microsoft PowerPoint, operations research, ac..."
1,Actuaries,"[Microsoft PowerPoint, Python, Structured quer..."
2,Administrative Services Managers,"[information management, Microsoft PowerPoint,..."
3,Advertising Sales Agents,"[Adobe Creative Cloud software, Microsoft Powe..."
4,Advertising and Promotions Managers,"[Microsoft PowerPoint, Microsoft Office softwa..."
...,...,...
159,Validation Engineers,"[Microsoft Excel, Microsoft PowerPoint, Micros..."
160,Video Game Designers,"[JavaScript, Adobe Photoshop, Oracle Java, C, ..."
161,Web Administrators,"[Cascading style sheets CSS, Adobe Photoshop, ..."
162,Web Developers,"[Spring Framework, Cascading style sheets CSS,..."


In [36]:
analysis_data_combined = calc_similarity_one_cv(df_resumes, result_df, parallel=True)
analysis_data_combined.head(20)

Unnamed: 0,Title,similarity_score,rank
102,Market Research Analysts and Marketing Special...,0.654053,1.0
55,Financial Managers,0.653941,2.0
108,Online Merchants,0.642066,3.0
0,Accountants and Auditors,0.632429,4.0
98,Logistics Engineers,0.626528,5.0
58,Financial and Investment Analysts,0.621309,6.0
121,"Purchasing Agents, Except Wholesale, Retail, a...",0.620888,7.0
103,Marketing Managers,0.618761,8.0
122,Purchasing Managers,0.616892,9.0
43,Data Warehousing Specialists,0.616785,10.0


**Example**

In [39]:
df_resumes['Skills'].value_counts()

Skills
[business, finance, business administration, customer relationship management, documentation, software, business intelligence]    1
Name: count, dtype: int64

In [40]:

#df_relevant[df_relevant['Title'] == 'Online Merchants']['raw'].values[0]
result_df[result_df['Title'] == 'Billing and Posting Clerks']['Skills'].values[0]


['Microsoft PowerPoint',
 'Microsoft Office software',
 'operations research',
 'Microsoft Word',
 'Microsoft Excel',
 'accounting',
 'Microsoft Outlook']

### Different calculation of similairty considering exact matching words

#### Add 1 word per each match, add to score and scale 0-1

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os

def calc_similarity_add_words(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions,
    combining semantic similarity (via MPNET embeddings) and explicit keyword matching.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
   
    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    def explicit_match(title, skills, keywords):
        """
        Return the number of exact keyword matches in title or skills.
        """
        # Normalize the lists to lowercase and strip whitespace
        title_words = set(title.lower().split())
        skill_words = set(skill.lower().strip() for skill in skills)
        keyword_set = set(keyword.lower().strip() for keyword in keywords)

        # Check for exact matches in title or skills
        title_match = keyword_set & title_words  # Intersection of sets for title
        skills_match = keyword_set & skill_words  # Intersection of sets for skills

        # Return the total number of matches
        return len(title_match) + len(skills_match)
    
    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute semantic similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()
    job_df['semantic_similarity'] = similarity_scores

    # Compute explicit matches and word_match_contribution
    cv_skills = cv_df['Skills'].iloc[0]
    job_df['word_matches'] = job_df.apply(
        lambda row: explicit_match(row['Title'], row['Skills'], cv_skills),
        axis=1
    )


    # Compute the final score
    job_df['final_score'] = (
        job_df['semantic_similarity'] +  # 80% weight for semantic similarity
        job_df['word_match_contribution']  # 20% weight for normalized word match contribution
    )

    # Handle edge case: if all contributions are the same, set a neutral normalized value
    min_score = job_df['final_score'].min()
    max_score = job_df['final_score'].max()
    if min_score == max_score:
        job_df['final_score'] = 0.5  # Set all to a neutral value
    else:
        # Normalize word_match_contribution between 0 and 1
        job_df['final_score'] = (
            (job_df['final_score'] - min_score) /
            (max_score - min_score)
        )


    # Rank by final score
    job_df['rank'] = job_df['final_score'].rank(ascending=False)

    # Sort and return results with all relevant columns
    result_df_sorted = job_df[['Title', 'semantic_similarity', 'word_matches', 
                               'word_match_contribution_raw', 'word_match_contribution', 
                               'final_score', 'rank']].sort_values(by='rank', ascending=True)

    return result_df_sorted


In [None]:
analysis_data_words = calc_similarity_add_words(df_resumes, result_df, parallel=True)


#### function with 20% of weight for matching words

In [62]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os

def calc_similarity_with_explicit_match(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions,
    combining semantic similarity (via MPNET embeddings) and explicit keyword matching.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
   
    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    def explicit_match(title, skills, keywords):
        """
        Return the number of exact keyword matches in title or skills.
        """
        # Normalize the lists to lowercase and strip whitespace
        title_words = set(title.lower().split())
        skill_words = set(skill.lower().strip() for skill in skills)
        keyword_set = set(keyword.lower().strip() for keyword in keywords)

        # Check for exact matches in title or skills
        title_match = keyword_set & title_words  # Intersection of sets for title
        skills_match = keyword_set & skill_words  # Intersection of sets for skills

        # Return the total number of matches
        return len(title_match) + len(skills_match)
    
    def calculate_word_match_contribution(row, cv_skills):
        # Count the number of matches
        matches = explicit_match(row['Title'], row['Skills'], cv_skills)
        # Calculate the raw word_match_contribution
        raw_contribution = matches * row['semantic_similarity']
        return matches, raw_contribution

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute semantic similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()
    job_df['semantic_similarity'] = similarity_scores

    # Compute explicit matches and word_match_contribution
    cv_skills = cv_df['Skills'].iloc[0]
    job_df['word_matches'], job_df['word_match_contribution_raw'] = zip(*job_df.apply(
        lambda row: calculate_word_match_contribution(row, cv_skills), axis=1
    ))

    # Handle edge case: if all contributions are the same, set a neutral normalized value
    min_contribution = job_df['word_match_contribution_raw'].min()
    max_contribution = job_df['word_match_contribution_raw'].max()
    if min_contribution == max_contribution:
        job_df['word_match_contribution'] = 0.5  # Set all to a neutral value
    else:
        # Normalize word_match_contribution between 0 and 1
        job_df['word_match_contribution'] = (
            (job_df['word_match_contribution_raw'] - min_contribution) /
            (max_contribution - min_contribution)
        )

    # Compute the final score
    job_df['final_score'] = (
        0.8 * job_df['semantic_similarity'] +  # 80% weight for semantic similarity
        0.2 * job_df['word_match_contribution']  # 20% weight for normalized word match contribution
    )

    # Rank by final score
    job_df['rank'] = job_df['final_score'].rank(ascending=False)

    # Sort and return results with all relevant columns
    result_df_sorted = job_df[['Title', 'semantic_similarity', 'word_matches', 
                               'word_match_contribution_raw', 'word_match_contribution', 
                               'final_score', 'rank']].sort_values(by='rank', ascending=True)

    return result_df_sorted


In [63]:
analysis_data_new = calc_similarity_with_explicit_match(df_resumes, result_df, parallel=True)


In [64]:
analysis_data_new.head(40)

Unnamed: 0,Title,semantic_similarity,word_matches,word_match_contribution_raw,word_match_contribution,final_score,rank
102,Market Research Analysts and Marketing Special...,0.859978,6,5.159871,0.919818,0.871946,1.0
3,Advertising Sales Agents,0.801381,7,5.609666,1.0,0.841105,2.0
108,Online Merchants,0.873433,4,3.493733,0.622806,0.823308,3.0
137,Search Marketing Strategists,0.794199,6,4.765194,0.849461,0.805251,4.0
103,Marketing Managers,0.810743,4,3.242973,0.578104,0.764215,5.0
78,Fundraising Managers,0.811423,3,2.43427,0.433942,0.735927,6.0
4,Advertising and Promotions Managers,0.776351,4,3.105404,0.553581,0.731797,7.0
121,"Purchasing Agents, Except Wholesale, Retail, a...",0.803697,3,2.411092,0.42981,0.72892,8.0
77,Fundraisers,0.801172,3,2.403517,0.42846,0.72663,9.0
6,Billing and Posting Clerks,0.795235,3,2.385706,0.425285,0.721245,10.0


#### Example Check

In [100]:
df_resumes['Skills'].value_counts()

Skills
[marketing, communications, Canva, Microsoft Word, Microsoft Excel, Microsoft PowerPoint, advertising, google analytics, WordPress, webflow, structured text]    1
Name: count, dtype: int64

In [None]:
result_df[result_df['Title'] == 'Sales Representatives, Wholesale and Manufacturing, Technical and Scientific Products']['Skills'].values[0]


['Microsoft Office software',
 'Microsoft PowerPoint',
 'Salesforce software',
 'Microsoft Outlook',
 'engineering',
 'Microsoft Excel',
 'operations research']

In [105]:
result_df[result_df['Title'] == 'Operations Research Analysts']['Skills'].values[0]


['Python',
 'software',
 'Salesforce software',
 'Microsoft Outlook',
 'Microsoft Office software',
 'Microsoft Excel',
 'Microsoft Power BI',
 'Microsoft PowerPoint',
 'Structured query language SQL',
 'Tableau',
 'operations research']

In [None]:
result_df[result_df['Title'] == 'Search Marketing Strategists']['Skills'].values[0]



['Python',
 'Microsoft Office software',
 'marketing',
 'Microsoft Excel',
 'JavaScript',
 'Hypertext markup language HTML',
 'operations research',
 'Microsoft PowerPoint',
 'Google Analytics',
 'Cascading style sheets CSS',
 'WordPress',
 'search engine']

#### with 3 diffefrent weight for words matching based on list lenght

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os

def calc_similarity_with_dynamic_weight_lenght(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions,
    combining semantic similarity (via MPNET embeddings) and explicit keyword matching,
    with dynamic word match weight adjustments based on the length of the skill lists.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    def explicit_match(title, skills, keywords):
        """
        Return the number of exact keyword matches in title or skills.
        """
        # Normalize the lists to lowercase and strip whitespace
        title_words = set(title.lower().split())
        skill_words = set(skill.lower().strip() for skill in skills)
        keyword_set = set(keyword.lower().strip() for keyword in keywords)

        # Check for exact matches in title or skills
        title_match = keyword_set & title_words  # Intersection of sets for title
        skills_match = keyword_set & skill_words  # Intersection of sets for skills

        # Return the total number of matches
        return len(title_match) + len(skills_match)

    def calculate_word_match_contribution(row, cv_skills, max_word_match_contribution):
        # Calculate matches and contribution
        matches = explicit_match(row['Title'], row['Skills'], cv_skills)
        contribution = matches * row['semantic_similarity']
        # Normalize contribution based on max possible
        return contribution / max_word_match_contribution if max_word_match_contribution > 0 else 0

    def determine_word_match_weight(skill_count):
        """Determine the weight of word matches based on the number of skills."""
        if skill_count <= 3:
            return 0.10  # Short skill list
        elif skill_count <= 10:
            return 0.20  # Medium skill list
        else:
            return 0.25  # Long skill list

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute semantic similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()
    job_df['semantic_similarity'] = similarity_scores

    # Compute word matches and contribution
    cv_skills = cv_df['Skills'].iloc[0]
    job_df['word_matches'] = job_df.apply(
        lambda row: explicit_match(row['Title'], row['Skills'], cv_skills),
        axis=1
    )
    max_word_match_contribution = job_df.apply(
        lambda row: row['word_matches'] * row['semantic_similarity'], axis=1
    ).max()

    job_df['word_match_contribution'] = job_df.apply(
        lambda row: calculate_word_match_contribution(row, cv_skills, max_word_match_contribution),
        axis=1
    )

    # Dynamically adjust the weight of word match contribution
    job_df['word_match_weight'] = job_df['Skills'].apply(
        lambda skills: determine_word_match_weight(len(skills))
    )

    # Compute the final score as a weighted combination
    job_df['final_score'] = (
        (1 - job_df['word_match_weight']) * job_df['semantic_similarity'] +
        job_df['word_match_weight'] * job_df['word_match_contribution']
    )

    # Sort by the final score
    result_df_sorted = job_df.sort_values(by='final_score', ascending=False).reset_index(drop=True)

    # Include relevant columns in the output
    result_df_sorted = result_df_sorted[[
        'Title', 'semantic_similarity', 'word_matches', 'word_match_contribution',
        'word_match_weight', 'final_score'
    ]]

    return result_df_sorted


In [None]:
analysis_data_new = calc_similarity_with_dynamic_weight_lenght(df_resumes, result_df, parallel=True)
analysis_data_new