In [14]:
from transformers import AutoModel

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import gc
import torch

import json

In [21]:
# Load data
jobs_ts = pd.read_csv('../output/jobs_ts_cleaned_regex.csv') # jobs_ts_cleaned_regex, jobs_ts_cleaned_simsearch_regex
isco = pd.read_csv('../data/wi_labels.csv', dtype=str)


In [22]:
# choose from: title_desc_ts_postclean_regex_simsearch (in jobs_ts_cleaned_simsearch_regex), 
# title_desc_ts_postclean_regex (in jobs_ts_cleaned_regex), title_desc_ts_postclean (in jobs_ts_cleaned_regex)
jd_colname = "title_desc_ts_postclean" 

In [23]:
# Some last minute cleaning
isco['description'] = isco['description'].str.lower()

# Checking out notes/Some related occupations classified elsewhere” for ISCO
# These additional notes may end up saying how another classification would be better, and may confuse the rag
# so to remove them. Note, "notes" are always after "some related occupations..."
isco['description'] = (isco['description']
                       .str.replace(r'(notes\n.*)', '', regex=True)
                       .str.replace(r'(some related occupations classified elsewhere.*)', '', regex=True))


In [24]:
# create query (the jd) and document (isco) lists
jd = jobs_ts[jd_colname].to_list()
isco_desc = isco['description'].to_list()


In [25]:
# # Creating gold standard based on the recommendations made in previous models
"""
The whole idea for this part is so that we can have a sensing of what could be some sources of error, and of course having
to evaluate quality of responses without having to create a submission since we only have 10 submission attempts
Evaluation below was done for baseline bge-large v1.5

Conclusion:
Employment agents and contractors (3333) comes up a lot, possibly because of all the miscellaneous/non-JD related text e.g. APPLY Save Follow us Site language Magyar English About us Download our mobile application Profession Offers for employers Imprint International services Have a question Customer service Contact Terms and Conditions Terms of use Privacy Policy Profession 2021 All Rights Reserved. Login To save a job To save a job, enter your e-mail address or log in to your profile Forgot your password Login I will send it to myself Login with Facebook Login with Google account
keyword based approach may be useful and augmented onto this step
Romanian/bulgarian/lithuanian job descriptions seem quite problematic. A lot of misc stuff from websites/about HR company who's managing the job post. To confirm if this is the case

Need to remove "Some related occupations classified elsewhere" for isco dataset, without removing the notes after it. (Or may to consider removing the notes too, need to check first)

* 897858836 and 771344098 were not graded as there were 0 hints on the type of jobs they were -- just gibberish
"""

# Filtering just to the sampled, and gold standard isco codes
best_isco_dict = {"872828466":"7132",
                  "839465958":"2330", # is grade 7-9 secondary school? or high school. 2320 seems plausible too
                  "857077872":"2212", # consultants = specialists. maybe it doesn't understand that
                  "804595650":"2153", # Best guess, but JD was not descriptive at all
                  "785637891":"2634", # Not very descriptive JD
                  "843263945":"5223",
                  "822053239":"7523", # uninformative JD, text looks like buttons from a website
                  "823300143":"3334", # Not very informative, but highlights real estate multiple times
                  "834972267":"8322", # JD says car or bus driver. so 8322 or 8331 could be correct.
                  "793505712":"2514",
                  "766940573":"5113",
                  "830014473":"5223", # Was none of the top 3. 2434 (top label) is wholesale, not retail
                  "792868203":"9629", # Not descriptive at all. 
                  "844673659":"8211", # Not very descriptive, ambiguous JD
                  "824883616":"3434", # Literally no description
                  "891827176":"2431", # Big differences between the numbers. But it's more admin than the ISCO job description. maybe there will be a better suggestion next
                  "753741043":"7322",
                  "776400501":"8142", # Chemicals  vs plastics? Confused? maybe need keyword search?
                  "789188547":"2262",
                  "848225359":"1345", # Big differences between the numbers
                  "879064773":"9111", # 9111, hard to say which one. 9111 is the first pick tho, although 5152 seems slightly more relevant. It depends on the skill level, which is not quite stated in the JD
                  "775359643":"2433", # None of the 3 options were good. Too much focus on the business
                  "810131612":"7221",
                  "802436242":"7411", # No description of job
                  "754636130":"3321",
                  "779195013":"5131", # Very messy JD, has a lot of other jobs combined into 1.Mentions many diff countries, so maybe that's why 4221 (Travel consultants and clerks) came up tops
                  "799557704":"3112", # JD was different from the job title. That being said, civil engineering was mentioned quite a bit, in terms of buildings and blueprints etc
                  "844474920":"9412", # Only had job title!! the whole description was about the HR company. The only hint was "unskilled"
                  "872195174":"2151", # Non-descriptive, had to rely on isco hierarchy, seemed like a proper engineer instead of a technician
                  "836929936":"8152", # Close fight between 8152/8153, but 8152 seems all emcompassing
                  "887356686":"9333", # Totally wrong. Perhaps due to a technical terminology e.g. dock, and CACES
                  "877325404":"4222",
                  "794441431":"7111", # Not very descriptive. Not likely a supervisory role, hence 7111 instead of 3123
                  "891808036":"9321", # Only have job title
                  "847941059":"5322",
                  "786885741":"2431", # No tasks at all, only talks about the company. 2431 was the closest
                  "873295233":"3322",
                  "845150846":"2152",
                  "779705693":"9412",
                  "797628595":"9112", # 9129 might be better than 9112 due to it mentioning usage of machinery, which is slightly more special. Stick to 9112 for now
                  "828908699":"2141", # Was tough finding a job description for QA/QC. Maybe a better classification will come along
                  "863016564":"5244" # Most likely, but non-descriptive JD too.                 
                  } 

jobs_ts = jobs_ts[jobs_ts['id'].astype(str).isin(best_isco_dict.keys())].copy()

In [26]:
# create query (the jd) and document (isco) lists
jd = jobs_ts[jd_colname].to_list()
isco_desc = isco['description'].to_list()

In [27]:
# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)


In [29]:
# load model

# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

# # 2. Encode ~ 111 minutes
# embeddings_jd = model.encode(jd, task="text-matching", show_progress_bar=True)
# embeddings_isco_desc = model.encode(isco_desc, task="text-matching", show_progress_bar=True)
# similarity = embeddings_jd @ embeddings_isco_desc.T

# 2. Encode ~ 111 minutes
embeddings_jd = model.encode(jd, task="retrieval.query", show_progress_bar=True)
embeddings_isco_desc = model.encode(isco_desc, task="retrieval.passage", show_progress_bar=True)
similarity = embeddings_jd @ embeddings_isco_desc.T


Encoding:   0%|          | 0/2 [00:00<?, ?it/s]

Encoding:   0%|          | 0/14 [00:00<?, ?it/s]

In [30]:
# Get top n documents
top_n = 5

indices = np.argsort(similarity)[:, -top_n:]
scores = np.take_along_axis(similarity, indices, axis=-1)

compiled_topn_codes = []
compiled_topn_desc = []
for idx in range(len(jd)):
    isco_code_topn = [isco['code'][top_idx] for top_idx in indices[idx].tolist()]
    isco_desc_topn = [isco['description'][top_idx] for top_idx in indices[idx].tolist()]
    compiled_topn_codes.append(isco_code_topn)
    compiled_topn_desc.append(isco_desc_topn)
    
jobs_ts['isco_code'] = [topn_codes[-1] for topn_codes in compiled_topn_codes]

In [31]:
def calc_lca(row):
    isco1 = row['manual_isco']
    isco2 = row['isco_code']
    if isco1[0] != isco2[0]:
        return 0
    elif isco1[1] != isco2[1]:
        return 0.25
    elif isco1[2] != isco2[2]:
        return 0.5
    elif isco1[3] != isco2[3]:
        return 0.75
    else:
        return 1

def calc_lca_df(df):
    eval_df = df[df['id'].astype(str).isin(best_isco_dict.keys())].copy()
    eval_df['manual_isco'] = eval_df['id'].astype(str).map(best_isco_dict)
    eval_df['lca'] = eval_df.apply(calc_lca, axis=1)
    avg_lca = eval_df['lca'].mean()
    print(f"Avg LCA: {avg_lca}") 
    return avg_lca

calc_lca_df(jobs_ts)  # 0.5357142857142857, 0.5952380952380952

Avg LCA: 0.5952380952380952


0.5952380952380952

In [13]:
# Save out top n descriptions and code
with open("../output/compiled_topn_desc.json", 'w') as f:
    json.dump(compiled_topn_desc, f) 

with open("../output/compiled_topn_codes.json", 'w') as f:
    json.dump(compiled_topn_codes, f) 


Cross encoder time: Try 3 diff rerankers, each have their own way of using the reranker

In [7]:
# Load top n descriptions and code again, if need to split the script into 2 runs
with open("../output/compiled_topn_desc.json", 'r') as f:
    compiled_topn_desc = json.load(f)
    
with open("../output/compiled_topn_codes.json", 'r') as f:
    compiled_topn_codes = json.load(f)


In [32]:
# Somehow this is the best way.. using sentence transformers led to constantly increasing ram usage, which slows things
# down considerably. ~ 8hrs
from transformers import AutoModelForSequenceClassification

tqdm.pandas(desc='Cross-encoding in progress')
model = AutoModelForSequenceClassification.from_pretrained(
    'jinaai/jina-reranker-v2-base-multilingual',
    torch_dtype="auto",
    trust_remote_code=True,
)

model.to("mps") 
model.eval()

# The description is the query
reranked_isco_codes = []
for idx, query in tqdm(enumerate(jd), total=len(jd)):
    # rerank the results with original query and documents returned from Chroma
    sentence_pairs = [(query, doc) for doc in compiled_topn_desc[idx]]
    scores = model.compute_score(sentence_pairs)
    # get the isco of the highest scoring document
    reranked_isco_codes.append(compiled_topn_codes[idx][np.argmax(scores)])
    # To prevent ram from accumulating, could be someting related to MPS
    gc.collect()
    torch.mps.empty_cache()
    



  0%|          | 0/42 [00:00<?, ?it/s]

In [33]:
# check LCA of reranked dataset
def calc_lca(row):
    isco1 = row['manual_isco']
    isco2 = row['isco_code']
    if isco1[0] != isco2[0]:
        return 0
    elif isco1[1] != isco2[1]:
        return 0.25
    elif isco1[2] != isco2[2]:
        return 0.5
    elif isco1[3] != isco2[3]:
        return 0.75
    else:
        return 1

def calc_lca_df(df):
    eval_df = df[df['id'].astype(str).isin(best_isco_dict.keys())].copy()
    eval_df['manual_isco'] = eval_df['id'].astype(str).map(best_isco_dict)
    eval_df['lca'] = eval_df.apply(calc_lca, axis=1)
    avg_lca = eval_df['lca'].mean()
    print(f"Avg LCA: {avg_lca}") 
    return avg_lca

jobs_ts_reranked = jobs_ts.copy()
jobs_ts_reranked['isco_code'] = reranked_isco_codes
calc_lca_df(jobs_ts_reranked) # 0.6964285714285714, 0.7142857142857143

Avg LCA: 0.7142857142857143


0.7142857142857143

In [34]:
# Check gold standard dataset
gs_reranked = jobs_ts_reranked[jobs_ts_reranked['id'].astype(str).isin(best_isco_dict.keys())].copy()
gs_reranked['manual_isco'] = gs_reranked['id'].astype(str).map(best_isco_dict)

In [13]:
# Prepare submission
jobs_ts_reranked[['id', 'isco_code']].to_csv('../submission/classification.csv', index=False, header=False)

Archive

In [None]:
# For msmarco
tqdm.pandas(desc='Cross-encoding in progress')
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2", max_length=512,
                     device="mps")

# The description is the query
reranked_isco_codes = []
for idx, query in tqdm(enumerate(jd), total=len(jd)):
    # rerank the results with original query and documents returned from Chroma
    sentence_pairs = [(query, doc) for doc in compiled_topn_desc[idx]]
    scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
    # get the isco of the highest scoring document
    reranked_isco_codes.append(compiled_topn_codes[idx][np.argmax(scores)])
    
# check LCA of reranked dataset
jobs_ts_reranked = jobs_ts.copy()
jobs_ts_reranked['isco_code'] = reranked_isco_codes
calc_lca_df(jobs_ts_reranked) 

In [None]:
# For mxbai
tqdm.pandas(desc='Cross-encoding in progress')
model = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1",
                      device="mps")

# The description is the query
reranked_isco_codes = []
for idx, query in tqdm(enumerate(jd), total=len(jd)):
    # rerank the results with original query and documents
    sentence_pairs = [(query, doc) for doc in compiled_topn_desc[idx]]
    scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
    # get the isco of the highest scoring document
    reranked_isco_codes.append(compiled_topn_codes[idx][np.argmax(scores)])
    
    gc.collect()
    torch.mps.empty_cache()
    
# check LCA of reranked dataset
jobs_ts_reranked = jobs_ts.copy()
jobs_ts_reranked['isco_code'] = reranked_isco_codes
calc_lca_df(jobs_ts_reranked) 

In [None]:
# For BGE
from FlagEmbedding import FlagReranker
tqdm.pandas(desc='Cross-encoding in progress')
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

# The description is the query
reranked_isco_codes = []
for idx, query in tqdm(enumerate(jd), total=len(jd)):
    # rerank the results with original query and documents returned from Chroma
    sentence_pairs = [[query, doc] for doc in compiled_topn_desc[idx]]
    scores = reranker.compute_score(sentence_pairs)
    # get the isco of the highest scoring document
    reranked_isco_codes.append(compiled_topn_codes[idx][np.argmax(scores)])
    
# check LCA of reranked dataset
jobs_ts_reranked = jobs_ts.copy()
jobs_ts_reranked['isco_code'] = reranked_isco_codes
calc_lca_df(jobs_ts_reranked)  
