In [1]:
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
from sentence_transformers import CrossEncoder
import numpy as np
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load data
# Job postings table; later cells use its 'id', 'lang' and
# 'title_desc_ts_postclean_regex' columns.
jobs_ts = pd.read_csv(filepath_or_buffer='../output/jobs_ts_cleaned_regex.csv')
# ISCO label table ('code' and 'description' are used below); dtype=str reads
# every column as text.
isco = pd.read_csv(filepath_or_buffer='../data/wi_labels.csv', dtype=str)


In [3]:
# # Creating gold standard based on the recommendations made in previous models
"""
The idea of this part is to get a sense of the likely sources of error and, of course, to be able
to evaluate the quality of responses without creating a submission, since we only have 10 submission attempts.
Evaluation below was done for baseline bge-large v1.5

Conclusion:
Employment agents and contractors (3333) comes up a lot, possibly because of all the miscellaneous/non-JD related text e.g. APPLY Save Follow us Site language Magyar English About us Download our mobile application Profession Offers for employers Imprint International services Have a question Customer service Contact Terms and Conditions Terms of use Privacy Policy Profession 2021 All Rights Reserved. Login To save a job To save a job, enter your e-mail address or log in to your profile Forgot your password Login I will send it to myself Login with Facebook Login with Google account
A keyword-based approach may be useful and could be added on top of this step.
Romanian/Bulgarian/Lithuanian job descriptions seem quite problematic: a lot of miscellaneous text from the websites or about the HR company that is managing the job post. To confirm whether this is the case.

Need to remove "Some related occupations classified elsewhere" from the isco dataset, without removing the notes after it. (Or may want to consider removing the notes too; need to check first.)

* 897858836 and 771344098 were not graded as there were 0 hints on the type of jobs they were -- just gibberish
"""

# Filtering just to the sampled, and gold standard isco codes
# Hand-assigned ("gold standard") ISCO codes keyed by job id; ids and codes are both strings.
# The trailing comment on an entry records why that posting was hard to grade.
best_isco_dict = {"872828466":"7132",
                  "839465958":"2330", # is grade 7-9 secondary school? or high school. 2320 seems plausible too
                  "857077872":"2212", # consultants = specialists. maybe it doesn't understand that
                  "804595650":"2153", # Best guess, but JD was not descriptive at all
                  "785637891":"2634", # Not very descriptive JD
                  "843263945":"5223",
                  "822053239":"7523", # uninformative JD, text looks like buttons from a website
                  "823300143":"3334", # Not very informative, but highlights real estate multiple times
                  "834972267":"8322", # JD says car or bus driver. so 8322 or 8331 could be correct.
                  "793505712":"2514",
                  "766940573":"5113",
                  "830014473":"5223", # Was none of the top 3. 2434 (top label) is wholesale, not retail
                  "792868203":"9629", # Not descriptive at all.
                  "844673659":"8211", # Not very descriptive, ambiguous JD
                  "824883616":"3434", # Literally no description
                  "891827176":"2431", # Big differences between the numbers. But it's more admin than the ISCO job description. maybe there will be a better suggestion next
                  "753741043":"7322",
                  "776400501":"8142", # Chemicals  vs plastics? Confused? maybe need keyword search?
                  "789188547":"2262",
                  "848225359":"1345", # Big differences between the numbers
                  "879064773":"9111", # 9111, hard to say which one. 9111 is the first pick tho, although 5152 seems slightly more relevant. It depends on the skill level, which is not quite stated in the JD
                  "775359643":"2433", # None of the 3 options were good. Too much focus on the business
                  "810131612":"7221",
                  "802436242":"7411", # No description of job
                  "754636130":"3321",
                  "779195013":"5131", # Very messy JD, has a lot of other jobs combined into 1.Mentions many diff countries, so maybe that's why 4221 (Travel consultants and clerks) came up tops
                  "799557704":"3112", # JD was different from the job title. That being said, civil engineering was mentioned quite a bit, in terms of buildings and blueprints etc
                  "844474920":"9412", # Only had job title!! the whole description was about the HR company. The only hint was "unskilled"
                  "872195174":"2151", # Non-descriptive, had to rely on isco hierarchy, seemed like a proper engineer instead of a technician
                  "836929936":"8152", # Close fight between 8152/8153, but 8152 seems all encompassing
                  "887356686":"9333", # Totally wrong. Perhaps due to a technical terminology e.g. dock, and CACES.
                  "877325404":"4222",
                  "794441431":"7111", # Not very descriptive. Not likely a supervisory role, hence 7111 instead of 3123
                  "891808036":"9321", # Only have job title
                  "847941059":"5322",
                  "786885741":"2431", # No tasks at all, only talks about the company. 2431 was the closest
                  "873295233":"3322",
                  "845150846":"2152",
                  "779705693":"9412",
                  "797628595":"9112", # 9129 might be better than 9112 due to it mentioning usage of machinery, which is slightly more special. Stick to 9112 for now
                  "828908699":"2141", # Was tough finding a job description for QA/QC. Maybe a better classification will come along
                  "863016564":"5244" # Most likely, but non-descriptive JD too.

                  }

# Keep only the postings that have a hand-assigned code. This rebinds jobs_ts,
# so every later cell (retrieval, evaluation, submission) sees this subset only.
jobs_ts = jobs_ts[jobs_ts['id'].astype(str).isin(best_isco_dict.keys())].copy()

In [4]:
# Replace with your desired embedding model
# Models tried so far: BAAI/bge-large-en-v1.5, BAAI/bge-base-en-v1.5, BAAI/bge-m3,
# jinaai/jina-embeddings-v2-base-en, dunzhang/stella_en_400M_v5,
# mixedbread-ai/mxbai-embed-large-v1, nomic-ai/nomic-embed-text-v1,
# intfloat/multilingual-e5-large
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="BAAI/bge-m3",
    trust_remote_code=True,
)

# Create a ChromaDB client
client = chromadb.Client()

# Create the "isco" collection; the metadata sets the HNSW space to cosine.
# NOTE(review): presumably this raises if "isco" already exists on the client,
# which would be why the notebook ends with a delete_collection cell - confirm.
collection = client.create_collection(
    name="isco",
    metadata={"hnsw:space": "cosine"},
    embedding_function=embedding_function,
)


In [5]:
# Add documents to the collection: one document per row of the ISCO table,
# with the row's description as the text and its code as the id.
collection.add(
    documents=isco['description'].tolist(),
    ids=isco['code'].tolist(),
)

In [6]:
# Document retrieval
# Hand Chroma a plain list of strings instead of the pandas Series. After the
# gold-standard filter jobs_ts no longer has a 0..n-1 index, so a Series would
# be indexed by label rather than by position wherever it is subscripted.
results = collection.query(
    query_texts=jobs_ts['title_desc_ts_postclean_regex'].tolist(),  # Chroma will embed this for you
    n_results=5,  # how many results to return per query
)

In [7]:
# Get retrieved ISCO code, distances, and top ranked isco code
# results['ids'] and results['distances'] each hold one list per query; the
# first id of each list is taken as the predicted code.
jobs_ts['retrieved_isco_codes'] = list(results['ids'])
jobs_ts['retrieved_distances'] = list(results['distances'])
jobs_ts['isco_code'] = [candidate_ids[0] for candidate_ids in results['ids']]

In [8]:
# Sample a few rows per language and review retrieved code: For the initial review, but note the findings of the review
# and the creation of the "gold standard" dataset are above
# Takes the first two rows of each 'lang' group, restricted to the review columns.
sampled_results = (
    jobs_ts
    .groupby("lang")[['id', 'lang', 'retrieved_isco_codes', 'title_desc_ts_postclean_regex']]
    .head(2)
)


In [9]:
# evaluation: 
# 1st try (bge-large-1.5, k=10): 0.631 # Done the proper way! 
# 2nd try (bge-large-1.5, jinaai/jina-reranker-v2-base-multilingual, k=10): 0.565
# (mixedbread-ai/mxbai-embed-large-v1, k=10) 0.738
# (mixedbread-ai/mxbai-embed-large-v1 with jinaai/jina-reranker-v2-base-multilingual, k=10): 0.625
# (mixedbread-ai/mxbai-embed-large-v1 with jinaai/jina-reranker-v2-base-multilingual, k=5): 0.684
# 4th try (jina-embeddings-v2-base-en, k=10): 0.553
# 5th try (jina-embeddings-v2-base-en, jinaai/jina-reranker-v2-base-multilingual, k=10): 0.613
def calc_lca(row):
    """Score how far two ISCO codes agree, comparing characters left to right.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide 'manual_isco' (the reference code) and 'isco_code' (the
        predicted code). Both are indexed character by character at
        positions 0 to 3.

    Returns
    -------
    0 when the first character differs; 0.25, 0.5 or 0.75 when the first
    mismatch is at the second, third or fourth character; 1 when all four
    characters match.
    """
    reference = row['manual_isco']
    predicted = row['isco_code']
    # Score awarded when the first mismatch is found at that position.
    mismatch_scores = (0, 0.25, 0.5, 0.75)
    for position, score in enumerate(mismatch_scores):
        if reference[position] != predicted[position]:
            return score
    return 1

def calc_lca_df(df):
    """Print and return the mean LCA score over the gold-standard rows of df.

    A row is kept when its 'id', compared as a string, is a key of
    best_isco_dict. The mapped value becomes 'manual_isco' and calc_lca is
    applied row by row against 'isco_code'. The work is done on a copy, so
    df itself is left unchanged.
    """
    ids_as_text = df['id'].astype(str)
    has_gold_label = ids_as_text.isin(best_isco_dict.keys())

    eval_df = df[has_gold_label].copy()
    eval_df['manual_isco'] = ids_as_text[has_gold_label].map(best_isco_dict)
    eval_df['lca'] = eval_df.apply(calc_lca, axis=1)

    avg_lca = eval_df['lca'].mean()
    print(f"Avg LCA: {avg_lca}")
    return avg_lca

# Score the codes currently stored in jobs_ts['isco_code'] against the gold standard
calc_lca_df(jobs_ts)

Avg LCA: 0.5357142857142857


0.5357142857142857

In [10]:
# If want to apply Cross-encoder. Only jina seems to have a reranker which can take long context lengths
# Tried: jinaai/jina-reranker-v1-turbo-en, jinaai/jina-reranker-v2-base-multilingual, Alibaba-NLP/gte-multilingual-reranker-base
tqdm.pandas(desc='Cross-encoding in progress')
model = CrossEncoder(
    "jinaai/jina-reranker-v2-base-multilingual",
    automodel_args={"torch_dtype": "auto"},
    trust_remote_code=True,
)

# The description is the query
reranked_isco_codes = []
for position, description in tqdm(enumerate(jobs_ts['title_desc_ts_postclean_regex']),
                                  total=len(jobs_ts['title_desc_ts_postclean_regex'])):
    # candidates that Chroma returned for this description
    candidate_docs = results["documents"][position]
    candidate_ids = results["ids"][position]
    # score every (description, candidate document) pair with the cross-encoder
    pair_scores = model.predict(
        [(description, doc) for doc in candidate_docs],
        convert_to_tensor=True,
    ).tolist()
    # keep the isco code of the highest scoring document
    best_position = np.argmax(pair_scores)
    reranked_isco_codes.append(candidate_ids[best_position])

  0%|          | 0/42 [00:00<?, ?it/s]

In [11]:
# check LCA of reranked dataset
# Same rows as jobs_ts, with 'isco_code' replaced by the cross-encoder's pick;
# jobs_ts itself keeps the retrieval-only codes.
jobs_ts_reranked = jobs_ts.assign(isco_code=reranked_isco_codes)
calc_lca_df(jobs_ts_reranked)

Avg LCA: 0.7083333333333334


0.7083333333333334

In [19]:
# Check gold standard dataset
# Needs jobs_ts_reranked from the reranking cells; run those first, otherwise
# this cell fails with a NameError.
gs_reranked = jobs_ts_reranked.loc[
    jobs_ts_reranked['id'].astype(str).isin(best_isco_dict.keys())
].copy()
gs_reranked['manual_isco'] = gs_reranked['id'].astype(str).map(best_isco_dict)

NameError: name 'jobs_ts_reranked' is not defined

In [17]:
# Prepare submission
# Writes 'id' and 'isco_code' with no header row and no index column.
# NOTE(review): this exports jobs_ts, i.e. the retrieval-only codes for the rows
# kept by the gold-standard filter, not jobs_ts_reranked - confirm this is intended.
jobs_ts.to_csv(
    '../submission/classification.csv',
    columns=['id', 'isco_code'],
    index=False,
    header=False,
)

In [None]:
# If want to create new vector database
# Removes the "isco" collection from the client; run this before re-creating
# the collection (for example after switching the embedding model).
client.delete_collection(name="isco")