After cleaning the data, this script performs a semantic search in two stages. First, an embedding model scores the similarity between each query and every document. Then a cross-encoder reranks a specified top-n number of documents from that first stage. The cross-encoder is more accurate at predicting similarity scores, but it is more computationally expensive and time-consuming: every query-document pair has to be passed through the entire network together, whereas an embedding model encodes each text only once and the resulting vectors are then compared cheaply.

In this case, the queries are the job descriptions, and the documents to be retrieved are the ISCO descriptions + the corresponding codes.

The outputs of both script 1 and 2 are used in this script.

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import json

In [8]:
# File locations used throughout this script.

# Inputs (the jobs file is the output of 1.data_processing.ipynb).
jobs_input_path = "../output/jobs_ts_cleaned_simsearch_regex.csv"
isco_input_path = "../output/wi_labels_cleaned.csv"

# Intermediate results of the similarity search, written out so they can be
# reloaded later: the top n ISCO codes and the top n ISCO descriptions per job.
compiled_topn_codes_path = "../output/compiled_topn_codes.json"
compiled_topn_desc_path = "../output/compiled_topn_desc.json"

# Final output. The submission folder needs to exist as well.
submission_path = "../submission/classification.csv"

In [3]:
# Read the two input tables from disk.
# The ISCO table is read with every column as a string (dtype=str),
# presumably so that the codes are not parsed as numbers.
isco = pd.read_csv(isco_input_path, dtype=str)
jobs_ts = pd.read_csv(jobs_input_path)

# Initialise
# Number of ISCO descriptions kept per job description by the initial
# similarity search; these top n candidates are then fed into the cross-encoder.
top_n = 5
# Column of the jobs table that holds the job description text used as the query.
jd_colname = "title_desc_ts_postclean_regex_simsearch"

# Device used by both sentence transformers and the cross-encoder.
# It is picked automatically so the script runs unmodified on any machine:
# a CUDA GPU if one is available, otherwise Apple silicon ("mps"), otherwise CPU.
# To force a particular device, overwrite `device` after this block.
import torch

if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # getattr guard: older torch builds have no torch.backends.mps module.
    device = "mps"
else:
    device = "cpu"

In [4]:
# Build the plain Python lists used for retrieval:
#   - isco_desc: the documents (ISCO descriptions)
#   - jd:        the queries (job description text from the configured column)
# NOTE(review): any missing text would come through as NaN rather than a
# string - confirm that the upstream cleaning guarantees there are none.
isco_desc = isco['description'].tolist()
jd = jobs_ts[jd_colname].tolist()

Similarity search

In [None]:
# Embed the JDs and the ISCO descriptions with the same model, then score every
# JD against every ISCO description using cosine similarity.
batch_size = 32  # How many text inputs are embedded at the same time

# The model is loaded at a fixed revision (trust_remote_code is enabled).
model = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    trust_remote_code=True,
    revision="526dc52cb738085d87002bf00ca4d3d99fd0029b",
)

# Both encode calls share exactly the same settings.
encode_kwargs = dict(
    device=device,
    batch_size=batch_size,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)
embeddings_jd = model.encode(jd, **encode_kwargs)
embeddings_isco_desc = model.encode(isco_desc, **encode_kwargs)

# The embeddings are already normalised, so the matrix product of the two sets
# is the cosine similarity (one row per JD, one column per ISCO description).
similarity = np.matmul(embeddings_jd, embeddings_isco_desc.T)

In [6]:
# Get top n documents

# np.argsort orders each JD's similarity scores (against the ISCO descriptions)
# in ascending order, so the last top_n columns hold the positions of the most
# similar ISCO descriptions, with the single best match in the final column.
indices = np.argsort(similarity)[:, -top_n:]
scores = np.take_along_axis(similarity, indices, axis=-1)

# argsort yields row *positions*, so the ISCO columns are converted to arrays
# and indexed positionally. This stays correct even if the isco DataFrame does
# not carry a default 0..n-1 index, and it replaces the per-JD Python loop with
# one vectorised lookup. Result: one list of top_n entries per JD.
isco_codes = isco['code'].to_numpy()
isco_descs = isco['description'].to_numpy()
compiled_topn_codes = isco_codes[indices].tolist()
compiled_topn_desc = isco_descs[indices].tolist()

# Provisional prediction from the similarity search alone: the last entry of
# each top-n list is the highest-scoring ISCO code.
jobs_ts['isco_code'] = [topn_codes[-1] for topn_codes in compiled_topn_codes]

In [9]:
# Persist the top n descriptions and codes as JSON. This allows the script to
# be split into two parts if runtimes are too long or the previous embedding
# steps took up too much RAM.
for out_path, payload in (
    (compiled_topn_desc_path, compiled_topn_desc),
    (compiled_topn_codes_path, compiled_topn_codes),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

Reranking using cross-encoder: If the similarity search was previously run in a separate runtime, you need to run at least all the cells before the "Similarity search" section of this script before proceeding with this part.

In [10]:
# Reload the saved top n descriptions and codes. Needed when the script is
# split into 2 runs and the similarity search was done in an earlier one.
with open(compiled_topn_desc_path, 'r') as desc_file:
    compiled_topn_desc = json.loads(desc_file.read())

with open(compiled_topn_codes_path, 'r') as codes_file:
    compiled_topn_codes = json.loads(codes_file.read())

In [None]:
# Rerank each JD's top n candidates with a cross-encoder.
# The model is driven through transformers directly: using sentence transformers
# led to constantly increasing RAM usage, which slows things down considerably.
# Expect a runtime of roughly 8 hours.
tqdm.pandas(desc='Cross-encoding in progress')

model = AutoModelForSequenceClassification.from_pretrained(
    'jinaai/jina-reranker-v2-base-multilingual',
    revision="126747772a932960028d9f4dc93bd5d9c4869be4",
    trust_remote_code=True,
    torch_dtype="auto",
)
model.to(device)
model.eval()

# For every JD, score it against each of its top n ISCO descriptions and keep
# the ISCO code belonging to the highest-scoring description.
reranked_isco_codes = []
for idx, query in tqdm(enumerate(jd), total=len(jd)):
    candidate_descs = compiled_topn_desc[idx]
    candidate_codes = compiled_topn_codes[idx]

    # One (JD, ISCO description) pair per candidate
    sentence_pairs = [(query, doc) for doc in candidate_descs]
    scores = model.compute_score(sentence_pairs)

    # Position of the best-scoring candidate -> its ISCO code, for submission
    best_position = np.argmax(scores)
    reranked_isco_codes.append(candidate_codes[best_position])

# Replace the provisional codes with the reranked ones.
jobs_ts['isco_code'] = reranked_isco_codes

In [9]:
# Write the submission file: the job id and its predicted ISCO code for every
# job, with neither the DataFrame index nor a header row.
submission = jobs_ts[['id', 'isco_code']]
submission.to_csv(submission_path, header=False, index=False)