### Import

In [76]:
import os
import pandas as pd
from services.ResumeInfoExtraction import ResumeInfoExtraction
from services.JobInfoExtraction import JobInfoExtraction
from source.schemas.resumeextracted import ResumeExtractedModel # Let's reintroduce later on
from source.schemas.jobextracted import JobExtractedModel # Let's reintroduce later on
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
import markdown
import warnings 
import logging
import torch
import torch.nn.functional as F

logging.getLogger('pypdf').setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Get the absolute path of the root directory
ROOT_DIR = os.path.abspath(r'C:\Users\irene\OneDrive\Desktop\Thesis\MAIN_CODE')

# Paths to your pattern files
skills_patterns_path = os.path.join(ROOT_DIR, 'workproject_matching_algo','Resources', 'data', 'skills.jsonl')

In [77]:
import main
import pandas as pd
import os
import matplotlib.pyplot as plt


In [166]:
df_resumes = main.get_resumes('resumes')

In [167]:
df_resumes = main.resume_extraction(df_resumes)
df_resumes

Unnamed: 0,name,raw,Skills
0,Irene Abbatelli CV,"\n \n \nIRENE ABBATELLI \nRome, Italy \n(+39)...","[business, analytics, data analysis, data visu..."


In [None]:
df_resumes_first_row = df_resumes.iloc[]
df_resumes_first_row

name                                                      CV_1_9
raw            **John Doe**   \nLondon, UK   \njohn.doe@email...
Skills         [modelling, machine learning, business, projec...
Skills_Text    analytics</s> aws</s> azure</s> business</s> c...
Name: 4, dtype: object

*conda install -c conda-forge openpyxl*

### Skill_Role df

In [168]:
file_path = os.path.join('skills_roles_data', 'Technology Skills.xlsx')
df_skill_role = pd.read_excel(file_path)
df_skill_role

Unnamed: 0,O*NET-SOC Code,Title,Example,Commodity Code,Commodity Title,Hot Technology,In Demand
0,11-1011.00,Chief Executives,Adobe Acrobat,43232202,Document management software,Y,N
1,11-1011.00,Chief Executives,AdSense Tracker,43232306,Data base user interface and query software,N,N
2,11-1011.00,Chief Executives,Atlassian JIRA,43232201,Content workflow software,Y,N
3,11-1011.00,Chief Executives,Blackbaud The Raiser's Edge,43232303,Customer relationship management CRM software,N,N
4,11-1011.00,Chief Executives,ComputerEase construction accounting software,43231601,Accounting software,N,N
...,...,...,...,...,...,...,...
32622,53-7121.00,"Tank Car, Truck, and Ship Loaders",Linux,43233004,Operating system software,Y,N
32623,53-7121.00,"Tank Car, Truck, and Ship Loaders",Microsoft Excel,43232110,Spreadsheet software,Y,N
32624,53-7121.00,"Tank Car, Truck, and Ship Loaders",Microsoft Office software,43231513,Office suite software,Y,N
32625,53-7121.00,"Tank Car, Truck, and Ship Loaders",SAP software,43231602,Enterprise resource planning ERP software,Y,N


In [169]:
df_skill_role = df_skill_role.drop(columns=['O*NET-SOC Code', 'Commodity Code', 'Commodity Title'])

In [170]:
df_skill_role_filtered = df_skill_role[(df_skill_role['Hot Technology'] == 'Y') & (df_skill_role['In Demand'] == 'Y')]
df_skill_role_filtered

Unnamed: 0,Title,Example,Hot Technology,In Demand
27,Chief Executives,Microsoft Excel,Y,Y
29,Chief Executives,Microsoft Office software,Y,Y
31,Chief Executives,Microsoft PowerPoint,Y,Y
57,Chief Sustainability Officers,Microsoft Office software,Y,Y
140,General and Operations Managers,Microsoft Excel,Y,Y
...,...,...,...,...
32460,Aviation Inspectors,Microsoft Excel,Y,Y
32464,Aviation Inspectors,Microsoft Word,Y,Y
32474,"Transportation Vehicle, Equipment and Systems ...",Microsoft Excel,Y,Y
32475,"Transportation Vehicle, Equipment and Systems ...",Microsoft Office software,Y,Y


In [171]:
df_skill_role_grouped = df_skill_role_filtered.groupby('Title')['Example'].apply(list).reset_index()
df_skill_role_grouped.rename(columns={'Example': 'Skills'}, inplace=True)
df_skill_role_grouped

Unnamed: 0,Title,Skills
0,Accountants and Auditors,"[Intuit QuickBooks, Microsoft Excel, Microsoft..."
1,Actuaries,"[Microsoft Excel, Microsoft Office software, M..."
2,Administrative Services Managers,"[Microsoft Excel, Microsoft Office software, M..."
3,"Adult Basic Education, Adult Secondary Educati...","[Microsoft Excel, Microsoft Office software]"
4,Advertising Sales Agents,"[Adobe Creative Cloud software, Adobe Illustra..."
...,...,...
448,Wind Energy Engineers,"[C++, Microsoft Excel, Python, The MathWorks M..."
449,Wind Turbine Service Technicians,"[Microsoft Office software, SAP software]"
450,Word Processors and Typists,"[Microsoft Excel, Microsoft Office software, M..."
451,Writers and Authors,"[Adobe Photoshop, Microsoft Excel, Microsoft O..."


In [172]:
def calc_similarity_one_cv(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions 
    based on MPNET embeddings of combined skills.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()

    # Add similarity scores to the job descriptions DataFrame
    job_df['similarity_score'] = similarity_scores

    # Rank the jobs by similarity score
    job_df['rank'] = job_df['similarity_score'].rank(ascending=False)

    # Return a DataFrame with the job titles, similarity scores, and ranks
    ranked_jobs = job_df[['Title', 'similarity_score', 'rank']].sort_values(by='rank', ascending=True)
    return ranked_jobs

analysis_data = calc_similarity_one_cv(df_resumes, df_skill_role_grouped, parallel=True)

In [None]:
def calc_similarity_one_cv(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions 
    based on MPNET embeddings of combined skills.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()

    # Add similarity scores to the job descriptions DataFrame
    job_df['similarity_score'] = similarity_scores

    # Rank the jobs by similarity score
    job_df['rank'] = job_df['similarity_score'].rank(ascending=False)

    # Return a DataFrame with the job titles, similarity scores, and ranks
    ranked_jobs = job_df[['Title', 'similarity_score', 'rank']].sort_values(by='rank', ascending=True)
    return ranked_jobs

analysis_data = calc_similarity_one_cv(df_resumes, df_skill_role_grouped, parallel=True)

In [173]:
analysis_data.head(20)

Unnamed: 0,Title,similarity_score,rank
404,Statisticians,0.531196,1.0
41,Biostatisticians,0.487768,2.0
403,Statistical Assistants,0.476898,3.0
165,Financial Risk Specialists,0.476652,4.0
124,Economists,0.469297,5.0
207,Geneticists,0.457202,6.0
50,Business Intelligence Analysts,0.444778,7.0
267,Management Analysts,0.443104,8.0
1,Actuaries,0.441137,9.0
166,Financial and Investment Analysts,0.436911,10.0


### Job Description df

In [136]:
file_path_1 = os.path.join('skills_roles_data', 'Occupation Data.xlsx')
df_jobdescription = pd.read_excel(file_path_1)
df_jobdescription

Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
...,...,...,...
1011,55-3014.00,Artillery and Missile Crew Members,"Target, fire, and maintain weapons used to des..."
1012,55-3015.00,Command and Control Center Specialists,"Operate and monitor communications, detection,..."
1013,55-3016.00,Infantry,Operate weapons and equipment in ground combat...
1014,55-3018.00,Special Forces,"Implement unconventional operations by air, la..."


In [137]:
df_jobdescription.rename(columns={'Description': 'raw'}, inplace=True)
df_jobdescription = df_jobdescription.drop(columns=['O*NET-SOC Code'])

In [175]:
# Get unique titles from df_skill_role_grouped
unique_titles = df_skill_role_grouped['Title'].unique()

# Filter df_jobs to only include rows with titles in unique_titles
filtered_df_jobs = df_jobdescription[df_jobdescription['Title'].isin(unique_titles)]

filtered_df_jobs

Unnamed: 0,Title,raw,Skills,Skills_Text,similarity_score,rank
0,Chief Executives,Determine and formulate policies and provide o...,[operations research],operations research</s>,0.489298,320.5
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[operations research],operations research</s>,0.489298,320.5
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",[operations research],operations research</s>,0.489298,320.5
3,Legislators,"Develop, introduce, or enact laws and statutes...",[operations research],operations research</s>,0.489298,320.5
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...","[operations research, advertising]",advertising</s> operations research</s>,0.585841,12.0
...,...,...,...,...,...,...
967,"Captains, Mates, and Pilots of Water Vessels",Command or supervise operations of ships and w...,[operations research],operations research</s>,0.489298,320.5
974,Traffic Technicians,Conduct field studies to determine traffic vol...,[traffic engineering],traffic engineering</s>,0.368837,670.0
975,Transportation Inspectors,Inspect equipment or goods in connection with ...,[operations research],operations research</s>,0.489298,320.5
976,Aviation Inspectors,"Inspect aircraft, maintenance procedures, air ...",[communications],communications</s>,0.224348,750.5


In [177]:
df_jobs = main.job_info_extraction(filtered_df_jobs)
df_jobs

Unnamed: 0,Title,raw,Skills,Skills_Text,similarity_score,rank
0,Chief Executives,Determine and formulate policies and provide o...,[operations research],operations research</s>,0.489298,320.5
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[operations research],operations research</s>,0.489298,320.5
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",[operations research],operations research</s>,0.489298,320.5
3,Legislators,"Develop, introduce, or enact laws and statutes...",[operations research],operations research</s>,0.489298,320.5
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...","[operations research, advertising]",advertising</s> operations research</s>,0.585841,12.0
...,...,...,...,...,...,...
967,"Captains, Mates, and Pilots of Water Vessels",Command or supervise operations of ships and w...,[operations research],operations research</s>,0.489298,320.5
974,Traffic Technicians,Conduct field studies to determine traffic vol...,[traffic engineering],traffic engineering</s>,0.368837,670.0
975,Transportation Inspectors,Inspect equipment or goods in connection with ...,[operations research],operations research</s>,0.489298,320.5
976,Aviation Inspectors,"Inspect aircraft, maintenance procedures, air ...",[communications],communications</s>,0.224348,750.5


In [None]:
def calc_similarity_one_cv(cv_df, job_df, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions 
    based on MPNET embeddings of combined skills.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()

    # Add similarity scores to the job descriptions DataFrame
    job_df['similarity_score'] = similarity_scores

    # Rank the jobs by similarity score
    job_df['rank'] = job_df['similarity_score'].rank(ascending=False)

    # Return a DataFrame with the job titles, similarity scores, and ranks
    ranked_jobs = job_df[['Title', 'similarity_score', 'rank']].sort_values(by='rank', ascending=True)
    return ranked_jobs

In [178]:
analysis_data_1 = calc_similarity_one_cv(df_resumes, df_jobs, parallel=True)
analysis_data_1.head(20)

Unnamed: 0,Title,similarity_score,rank
142,Data Scientists,0.730054,1.0
86,Search Marketing Strategists,0.621133,2.0
109,Computer Systems Analysts,0.614566,3.0
12,Computer and Information Systems Managers,0.602137,4.0
358,Library Technicians,0.59078,6.0
129,Geographic Information Systems Technologists a...,0.59078,6.0
539,Intelligence Analysts,0.59078,6.0
85,Market Research Analysts and Marketing Special...,0.589108,8.0
4,Advertising and Promotions Managers,0.585841,10.0
664,Order Clerks,0.585841,10.0


### Similarity score calculation 