In [10]:
import sys
# Add the parent directory to sys.path
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import main
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [8]:
growth_df = pd.read_csv("data_from_girls/growth_df.csv")
df_skill_role_grouped = pd.read_csv("data_from_girls/hard_skills.csv")
similarity_df = pd.read_csv("data_from_girls/similarity_df.csv")
titles_df = pd.read_csv("data_from_girls/titles_df.csv")
merged_df = pd.read_csv("data_from_girls/merged_df.csv")

In [13]:
df_resumes = main.get_resumes("../resumes")
df_resumes = main.resume_extraction(df_resumes)

In [None]:
def similarity_final_3(cv_df, job_df=merged_df, hard_skills=df_skill_role_grouped, scores_df=similarity_df, titles_df = titles_df, growth_df = growth_df, role=None, wage = 0, parallel=False):
    """
    Calculate cosine similarity between a single CV and multiple job descriptions.
    Optionally filter based on a specific role and include role-specific scores in the output.
    """
    # Initialize the model
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    def add_eos(input_examples):
        """Helper function to add special tokens between each skill."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    def missing_skills(skills, cv):
        """Return a list of words from the job description (title and skills) that do not match any keywords in the CV."""
        skill_words = set(skill.lower().strip() for skill in skills if isinstance(skill, str))
        cv_set = set(cv.lower().strip() for cv in cv if isinstance(cv, str))
        missing_words = skill_words - cv_set
        return list(missing_words)

    # Find the generic title (Role) corresponding to the input title (Reported Job Title)
    generic_title = titles_df.loc[titles_df['Reported Job Title'] == role, 'Title'].iloc[0]
    
    # Filter scores_df using the generic title
    filtered_scores_df = scores_df[scores_df['Target_Role'] == generic_title]
    # Map the filtered scores to a dictionary for quick lookup
    role_score_mapping = dict(zip(filtered_scores_df['Role'], filtered_scores_df['composite_tasks_dwas_ksas']))


    # Extract and process the CV's skills
    cv_df['Skills_Text'] = cv_df['Skills'].apply(add_eos)
    cv_df['Skills_Text'] = cv_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    cv_embedding = model.encode(
        cv_df['Skills_Text'].iloc[0],  # Assuming a single CV is provided
        batch_size=1,
        show_progress_bar=False
    )

    # Preprocess the job descriptions' skills
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(
        lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else ''
    )
    job_embeddings = model.encode(
        job_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity
    similarity_scores = cosine_similarity([cv_embedding], job_embeddings).flatten()
    job_df['similarity_score'] = similarity_scores

    # Normalize similarity scores using z-score
    mean_score = job_df['similarity_score'].mean()
    std_score = job_df['similarity_score'].std()
    if std_score > 0:
        job_df['normalized_similarity_score'] = (job_df['similarity_score'] - mean_score) / std_score
    else:
        job_df['normalized_similarity_score'] = 0  # Assign 0 if std is zero (all scores are identical)

    # Rank the jobs by normalized similarity score
    job_df['rank'] = job_df['normalized_similarity_score'].rank(ascending=False)

    # Add missing skills
    job_df['missing_skills'] = job_df.apply(
        lambda row: missing_skills(
            hard_skills.loc[row.name, 'Skills'],  # Skills from df_skill_role_grouped
            cv_df['Skills'].iloc[0]  # Skills from the first row of df_resumes
        ),
        axis=1
    )

    # Map the growth information using generic title
    role_growth_mapping = dict(zip(growth_df['Occupation'], growth_df['Categories']))
    job_df['role_growth'] = job_df['Title'].map(role_growth_mapping)

    # Fill NaN values in 'role_growth' with 'Not In-Demand'
    job_df['role_growth'].fillna('Not In-Demand', inplace=True)

    # Check for missing salary information
    job_df['annual_wage_variation'] = job_df.apply(
        lambda row: "Info Not Available" if row['annual_wage'] == 0 else row['annual_wage'] - wage,
        axis=1
    )

    job_df_2 = job_df.copy()
    # Add the role-specific scores to a new column
    job_df_2['role_scores'] = job_df_2['Title'].apply(lambda x: role_score_mapping.get(x, None))
    # Rank the jobs by normalized similarity score
    job_df_2['rank'] = job_df_2['role_scores'].rank(ascending=False)

    # Return a DataFrame with the job titles, similarity scores, normalized scores, role-specific scores, and ranks
    ranked_jobs_2 = job_df_2[['rank', 'Title', 'missing_skills', 'annual_wage_variation', 'role_growth']].sort_values(by='rank', ascending=True)
    ranked_jobs = job_df[['rank', 'Title', 'missing_skills', 'annual_wage_variation', 'role_growth']].sort_values(by='rank', ascending=True)
    return ranked_jobs, ranked_jobs_2


In [16]:
foo, bar = similarity_final_3(df_resumes, role="Executive Director")

In [17]:
foo

Unnamed: 0,rank,Title,missing_skills,annual_wage_variation,role_growth
70,1.0,Clinical and Counseling Psychologists,"[, t, [, i, m, f, r, ,, o, e, c, z, s, ', a, ]]",106600,Rapid Growth
317,4.5,Radiation Therapists,"[, t, i, p, y, m, e, c, s, ', [, ]]",104420,Not In-Demand
282,4.5,"Orthopedic Surgeons, Except Pediatric","[, t, i, p, y, m, e, c, s, ', [, ]]",378250,Not In-Demand
185,4.5,General Internal Medicine Physicians,"[, t, i, p, y, m, e, c, s, ', [, ]]",245450,Not In-Demand
388,4.5,Urologists,"[, t, i, p, y, m, e, c, s, ', [, ]]",248640,Not In-Demand
...,...,...,...,...,...
308,400.0,Project Management Specialists,"[, p, m, f, ', i, k, ,, o, t, u, w, r, l, d, c...",104920,Rapid Growth
25,401.0,Atmospheric and Space Scientists,"[t, p, y, h, o, ', [, n, ]]",101530,Rapid Growth
113,402.0,Document Management Specialists,"[, p, m, f, ', i, k, ,, h, o, t, u, w, r, l, d...",112430,Rapid Growth
254,403.0,Medical Dosimetrists,"[, i, p, d, l, e, c, s, ', [, ]]",131850,Not In-Demand


In [18]:
bar

Unnamed: 0,rank,Title,missing_skills,annual_wage_variation,role_growth
186,1.0,General and Operations Managers,"[, p, m, f, ', i, ,, o, t, w, r, l, c, [, ], e...",129330,Rapid Growth; Numerous Job Openings
258,2.0,Medical and Health Services Managers,"[, t, i, k, m, u, f, r, l, ,, o, e, c, s, ', [...",134440,Rapid Growth
116,3.0,"Education Administrators, Kindergarten through...","[, p, m, f, ', i, ,, o, t, w, r, l, c, [, ], e...",111020,Not In-Demand
349,4.0,Social and Community Service Managers,"[, p, m, f, ', i, k, ,, o, t, u, w, r, l, c, [...",83400,Rapid Growth
335,5.0,Sales Managers,"[, p, m, f, ', i, k, ,, o, t, u, w, r, l, c, a...",157610,Rapid Growth
...,...,...,...,...,...
336,,"Sales Representatives of Services, Except Adve...","[, m, f, ', i, k, ,, o, t, u, w, r, l, c, [, a...",81080,Numerous Job Openings
343,,Security Management Specialists,"[, p, m, y, f, z, ', v, i, ,, h, o, b, t, u, w...",89130,Numerous Job Openings
344,,Security Managers,"[, p, m, f, ', i, ,, o, t, w, r, l, c, [, ], e...",111110,Not In-Demand
351,,Software Developers,"[, p, m, y, f, z, ', v, i, k, ,, h, g, o, b, ....",138110,Rapid Growth; Numerous Job Openings


In [19]:
df_resumes

Unnamed: 0,name,raw,Skills,Skills_Text
0,hanna_pedersen,\n \n \n \n \n \n \n \n \nEDUCATION \n \nNova...,"[business, analytics, business administration,...",analytics</s> artificial intelligence</s> busi...
1,irene_abbateli,"\n \n \nIRENE ABBATELLI \nRome, Italy \n(+39)...","[business, analytics, data analysis, machine l...",advertising</s> analytics</s> business adminis...
2,Luca_Oeztekin,"LUCA OEZTEKIN\nLisbon, Portugal & Cologne, Ger...","[analytics, artificial intelligence, modelling...",analytics</s> api</s> artificial intelligence<...
3,Tim_Gunkel,TIM GUNKEL \n \n \n \n \n \n \n ...,"[business, analytics, machine learning, busine...",analytics</s> business administration</s> busi...
4,Tim_gunkel2,TIM GUNKEL \n \n \n \n \n \n \n ...,"[business, analytics, machine learning, busine...",analytics</s> api</s> business administration<...
5,victor_bjorsvik,"VICTOR BJORSVIK \n \nLisbon, Portugal | +47 91...","[accounting, big data, communications, busines...",accounting</s> analytics</s> apache spark</s> ...


In [20]:
cosine_similarity_df, role_score_df  = similarity_final_3(df_resumes, role = 'Brand Manager', wage = 75000)

In [None]:
main.get_resumes("../resumes")
df_resumes = main.resume_extraction(df_resumes)

Unnamed: 0,rank,Title,missing_skills,annual_wage_variation,role_growth
70,1.0,Clinical and Counseling Psychologists,"[, t, [, i, m, f, r, ,, o, e, c, z, s, ', a, ]]",31600,Rapid Growth
317,4.5,Radiation Therapists,"[, t, i, p, y, m, e, c, s, ', [, ]]",29420,Not In-Demand
282,4.5,"Orthopedic Surgeons, Except Pediatric","[, t, i, p, y, m, e, c, s, ', [, ]]",303250,Not In-Demand
185,4.5,General Internal Medicine Physicians,"[, t, i, p, y, m, e, c, s, ', [, ]]",170450,Not In-Demand
388,4.5,Urologists,"[, t, i, p, y, m, e, c, s, ', [, ]]",173640,Not In-Demand
...,...,...,...,...,...
308,400.0,Project Management Specialists,"[, p, m, f, ', i, k, ,, o, t, u, w, r, l, d, c...",29920,Rapid Growth
25,401.0,Atmospheric and Space Scientists,"[t, p, y, h, o, ', [, n, ]]",26530,Rapid Growth
113,402.0,Document Management Specialists,"[, p, m, f, ', i, k, ,, h, o, t, u, w, r, l, d...",37430,Rapid Growth
254,403.0,Medical Dosimetrists,"[, i, p, d, l, e, c, s, ', [, ]]",56850,Not In-Demand
