In [None]:
import nltk
import numpy as np
import pandas as pd
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores.faiss import DistanceStrategy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK stop-word list (used to build `stop_words` for the BM25 preprocessing helper)
nltk.download('stopwords')

In [None]:
# Read the two input tables. The ISCO label file is loaded with every column as text (dtype=str).
isco = pd.read_csv('../data/wi_labels.csv', dtype=str)
jobs_ts = pd.read_csv('../output/jobs_ts.csv')


In [None]:
# Last-minute cleaning: lower-case the free-text column of each table so both sides of the
# retrieval (job text and ISCO descriptions) share the same casing.
for _frame, _text_col in ((jobs_ts, 'title_desc_ts_postclean'), (isco, 'description')):
    _frame[_text_col] = _frame[_text_col].str.lower()

In [None]:
# Strip the "Notes" and "Some related occupations classified elsewhere" sections from the ISCO descriptions.
# These sections talk about occupations that are better classified under *other* codes, which may confuse
# the RAG retrieval, so they are removed. "Notes" always comes after "Some related occupations...".
#
# The inline (?s) flag makes "." match newlines as well. Without it, ".*" stops at the first line break, so
# only the section header (or a single line of the notes) was removed and the rest of the section stayed in.
# NOTE(review): this assumes both sections are the trailing part of a description, i.e. nothing that should
# be kept follows them - confirm against wi_labels.csv.
#
# This cell previously appeared twice, back to back and byte-for-byte identical; one copy is enough.
isco['description'] = (isco['description']
                       .str.replace(r'(?s)notes\n.*', '', regex=True)
                       .str.replace(r'(?s)some related occupations classified elsewhere.*', '', regex=True))


In [None]:
# # Creating gold standard based on the recommendations made in previous models
"""
The idea of this part is to get a sense of the likely sources of error and, of course, to be able
to evaluate the quality of responses without creating a submission, since we only have 10 submission attempts
Evaluation below was done for baseline bge-large v1.5

Conclusion:
Employment agents and contractors (3333) comes up a lot, possibly because of all the miscellaneous/non-JD related text e.g. APPLY Save Follow us Site language Magyar English About us Download our mobile application Profession Offers for employers Imprint International services Have a question Customer service Contact Terms and Conditions Terms of use Privacy Policy Profession 2021 All Rights Reserved. Login To save a job To save a job, enter your e-mail address or log in to your profile Forgot your password Login I will send it to myself Login with Facebook Login with Google account
A keyword-based approach may be useful as an augmentation to this step
Romanian/Bulgarian/Lithuanian job descriptions seem quite problematic: a lot of miscellaneous text from the websites, or about the HR company that is managing the job post. To confirm if this is the case

Need to remove the "Some related occupations classified elsewhere" section from the ISCO dataset without removing the notes after it. (Or maybe consider removing the notes too; need to check first.)

* 897858836 and 771344098 were not graded as there were 0 hints on the type of jobs they were -- just gibberish
"""

# Manually graded "gold standard" for a sample of job postings.
# Keys are job ids as strings (they are matched against jobs_ts['id'].astype(str) below); values are the
# ISCO code chosen by hand for that posting, also as strings. The trailing comments record why a code was
# picked or why the posting was hard to grade (JD = job description).
best_isco_dict = {"872828466":"7132",
                  "839465958":"2330", # is grade 7-9 secondary school? or high school. 2320 seems plausible too
                  "857077872":"2212", # consultants = specialists. maybe it doesn't understand that
                  "804595650":"2153", # Best guess, but JD was not descriptive at all
                  "785637891":"2634", # Not very descriptive JD
                  "843263945":"5223",
                  "822053239":"7523", # uninformative JD, text looks like buttons from a website
                  "823300143":"3334", # Not very informative, but highlights real estate multiple times
                  "834972267":"8322", # JD says car or bus driver. so 8322 or 8331 could be correct.
                  "793505712":"2514",
                  "766940573":"5113",
                  "830014473":"5223", # Was none of the top 3. 2434 (top label) is wholesale, not retail
                  "792868203":"9629", # Not descriptive at all.
                  "844673659":"8211", # Not very descriptive, ambiguous JD
                  "824883616":"3434", # Literally no description
                  "891827176":"2431", # Big differences between the numbers. But it's more admin than the ISCO job description. maybe there will be a better suggestion next
                  "753741043":"7322",
                  "776400501":"8142", # Chemicals vs plastics? Confused? maybe need keyword search?
                  "789188547":"2262",
                  "848225359":"1345", # Big differences between the numbers
                  "879064773":"9111", # 9111, hard to say which one. 9111 is the first pick though, although 5152 seems slightly more relevant. It depends on the skill level, which is not quite stated in the JD
                  "775359643":"2433", # None of the 3 options were good. Too much focus on the business
                  "810131612":"7221",
                  "802436242":"7411", # No description of job
                  "754636130":"3321",
                  "779195013":"5131", # Very messy JD, has a lot of other jobs combined into 1. Mentions many diff countries, so maybe that's why 4221 (Travel consultants and clerks) came up tops
                  "799557704":"3112", # JD was different from the job title. That being said, civil engineering was mentioned quite a bit, in terms of buildings and blueprints etc
                  "844474920":"9412", # Only had job title!! the whole description was about the HR company. The only hint was "unskilled"
                  "872195174":"2151", # Non-descriptive, had to rely on isco hierarchy, seemed like a proper engineer instead of a technician
                  "836929936":"8152", # Close fight between 8152/8153, but 8152 seems all-encompassing
                  "887356686":"9333", # Totally wrong. Perhaps due to a technical terminology e.g. dock, and CACES
                  "877325404":"4222",
                  "794441431":"7111", # Not very descriptive. Not likely a supervisory role, hence 7111 instead of 3123
                  "891808036":"9321", # Only have job title
                  "847941059":"5322",
                  "786885741":"2431", # No tasks at all, only talks about the company. 2431 was the closest
                  "873295233":"3322",
                  "845150846":"2152",
                  "779705693":"9412",
                  "797628595":"9112", # 9129 might be better than 9112 due to it mentioning usage of machinery, which is slightly more special. Stick to 9112 for now
                  "828908699":"2141", # Was tough finding a job description for QA/QC. Maybe a better classification will come along
                  "863016564":"5244" # Most likely, but non-descriptive JD too.

                  }

# Keep only the sampled postings that have a gold-standard code. The ids are cast to str because the
# dictionary keys are strings; .copy() detaches the result from the original frame.
jobs_ts = jobs_ts[jobs_ts['id'].astype(str).isin(best_isco_dict.keys())].copy()

In [None]:
# Embedding model used for the dense (FAISS) retriever: BGE large English v1.5, run on CPU,
# with the normalize_embeddings encode option switched off.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
# Parallel lists handed to the retrievers: the ISCO description texts, and one {"code": ...}
# metadata dict per description (same order) so a retrieved document can be mapped back to its code.
code_dict_list = [dict(code=isco_code) for isco_code in isco['code']]
doc_list = isco['description'].tolist()

In [None]:
# initialize the bm25 retriever and faiss retriever

# Initialize the stemmer and stop words list
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Number of documents each retriever returns per query
k = 10


def bm25_preproc_function(text):
    """Normalise text for BM25: tokenize, lower-case, drop stop words and punctuation, then stem.

    Args:
        text: Raw text (a query or an ISCO description).

    Returns:
        A single string of stemmed tokens joined by spaces.

    Tokens are lower-cased *before* the stop-word test. The NLTK stop-word list is lower-case, so the
    previous case-sensitive check let capitalised stop words ("The", "And", ...) through while the stemmer
    lower-cased them afterwards. Output for already lower-cased input is unchanged.

    NOTE(review): word_tokenize needs NLTK's punkt tokenizer data, but only 'stopwords' is downloaded in
    the import cell - confirm the data is available before enabling this function.
    NOTE(review): BM25Retriever's preprocess_func is presumably expected to return a list of tokens,
    whereas this returns one joined string - verify (e.g. wrap with .split()) before wiring it in.
    """
    kept = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        # isalnum() discards punctuation-only tokens
        if token.isalnum() and token not in stop_words:
            kept.append(stemmer.stem(token))
    return ' '.join(kept)

# Sparse (keyword) retriever over the ISCO descriptions; every document carries its ISCO code as metadata.
# preprocess_func=bm25_preproc_function stays switched off here, so the retriever's default text
# preprocessing is what gets used (lowercasing, stopword removal and stemming were the candidate steps).
bm25_retriever = BM25Retriever.from_texts(doc_list, metadatas=code_dict_list)
bm25_retriever.k = k

# Dense retriever: embed the same descriptions with `hf` into a FAISS store and expose it as a
# retriever that returns the top-k matches.
faiss_vectorstore = FAISS.from_texts(
    doc_list,
    hf,
    metadatas=code_dict_list,
    distance_strategy=DistanceStrategy.COSINE,
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs=dict(k=k))

In [None]:
def calc_lca(row):
    """Score how deep in the ISCO hierarchy the predicted code agrees with the manual code.

    The two codes are compared digit by digit from the left; the score is the number of leading
    digits they share (at most four) divided by four.

    Args:
        row: Mapping/Series with 'manual_isco' (gold-standard code) and 'isco_code' (predicted code).

    Returns:
        0.0, 0.25, 0.5, 0.75 or 1.0 for 0-4 matching leading digits.

    Both codes are converted with str() first, and codes shorter than four characters are compared only
    up to their own length (so they can never score above len/4) instead of raising IndexError.
    The per-row debug prints of both codes were removed; they flooded the output on every evaluation.
    """
    gold_code = str(row['manual_isco'])
    predicted_code = str(row['isco_code'])

    shared_digits = 0
    for gold_digit, predicted_digit in zip(gold_code[:4], predicted_code[:4]):
        if gold_digit != predicted_digit:
            break
        shared_digits += 1
    return shared_digits / 4

def calc_lca_df(df):
    """Compute the average LCA score of the predictions in `df` against the manual gold standard.

    Only rows whose 'id' (as a string) is a key of `best_isco_dict` are scored. Each of those rows gets
    its manual code attached as 'manual_isco' and is scored with `calc_lca` against 'isco_code'.

    Args:
        df: DataFrame with an 'id' column and an 'isco_code' column holding the predicted codes.

    Returns:
        The mean of the per-row LCA scores (also printed).
    """
    has_gold = df['id'].astype(str).isin(best_isco_dict.keys())
    scored = df[has_gold].copy()
    scored['manual_isco'] = scored['id'].astype(str).map(best_isco_dict)
    scored['lca'] = scored.apply(calc_lca, axis=1)

    avg_lca = scored['lca'].mean()
    print(f"Avg LCA: {avg_lca}")
    return avg_lca



In [None]:
# Sweep the hybrid-retrieval weighting: faiss_wt is the share given to the FAISS (dense) retriever and
# 1 - faiss_wt the share given to BM25, from 0 to 1 in steps of 0.1.
# np.linspace is used instead of np.arange(0, 1 + 0.1, 0.1): with a float step arange does not guarantee
# the number of points, whereas linspace yields exactly 11 with exact endpoints. Rounding keeps the
# weights at clean one-decimal values (0.3 rather than 0.30000000000000004).
faiss_wts = np.round(np.linspace(0, 1, 11), 1)
lca_score_list = []
for faiss_wt in faiss_wts:
    print('FAISS Weight: ', faiss_wt)
    # Set up hybrid retriever
    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
                                           weights=[1 - faiss_wt, faiss_wt])

    # For each description, get relevant documents, take the first ranked document, and also the code
    top_doc_list = []
    top_code_list = []
    for desc in jobs_ts['title_desc_ts_postclean']:
        docs = ensemble_retriever.get_relevant_documents(desc)
        top_doc_list.append(docs[0])
        top_code_list.append(docs[0].metadata['code'])

    # Store this weighting's predictions on jobs_ts (overwritten on every iteration) and record the
    # average LCA against the manual gold standard
    jobs_ts['isco_code'] = top_code_list
    lca_score_list.append(calc_lca_df(jobs_ts))
        

In [None]:
# Plot: Hybrid doesn't seem to help in this case
import matplotlib.pyplot as plt

# One point per swept weight; labelled so the figure can be read on its own when skimming the notebook
fig, ax = plt.subplots()
ax.plot(faiss_wts, lca_score_list, marker='o')
ax.set_xlabel('FAISS weight (BM25 weight = 1 - FAISS weight)')
ax.set_ylabel('Average LCA')
ax.set_title('Average LCA vs. FAISS weight in the ensemble retriever')
plt.show()