This notebook cleans the jobs data further in two stages: a regex filter that drops boilerplate sentences, followed by a similarity search with an embedding model that keeps only the parts of a job description relevant to ISCO classification, i.e. tasks and skills.

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.notebook import tqdm
import re
import nltk

# Download the 'punkt_tab' NLTK resource at notebook start-up.
# NOTE(review): presumably this is the data nltk.tokenize.sent_tokenize needs further down — confirm for the installed NLTK version.
nltk.download('punkt_tab')

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hydraze/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# File locations. The input CSV is the output of 1.data_processing.ipynb;
# the output CSV is where this notebook saves the cleaned jobs data.
jobs_input_path, jobs_output_path = (
    '../output/jobs_ts.csv',
    '../output/jobs_ts_cleaned_simsearch_regex.csv',
)

In [3]:
# Load the jobs data from the input CSV into a DataFrame.
# NOTE(review): the next cell reads the 'title_desc_ts_postclean' column, so the CSV must contain it.
jobs_ts = pd.read_csv(jobs_input_path)

In [4]:
# Break every job description into a list of sentences so that the filters
# further down can keep or drop text one sentence at a time
jobs_ts['title_desc_ts_clean_sent_tok'] = jobs_ts['title_desc_ts_postclean'].map(nltk.tokenize.sent_tokenize)

In [5]:
# Filter out sentences which contain patterns unrelated to a job description. These sentences tend to be with respect to PDPA, or are 
# text belonging to the website which the JDs were scraped from e.g. "log in here", or "forgot password", which were likely to be buttons 
# on websites. 
def is_match(regex, text):
    """Return True if `regex` matches anywhere inside `text`, otherwise False."""
    return re.search(regex, text) is not None

# Keywords/phrases marking a sentence as website boilerplate or PDPA/legal text rather than job content.
# The pattern is applied as written (case-sensitive).
regex = r"personal data|discriminat.*|i agree home|privacy policy|cookie|confirmation link|\slogin\s|log in|\scv\s|curriculum vitae|recommended browser|job alert|forgot password|t&c|job opportunities|sign up|receive notification|job ads|click here|resume"

all_cleaned_sent_tok_list = [] # One list of kept sentences per row of jobs_ts
removed_sent_list = [] # Flat list of every dropped sentence, for analysing what is being removed
for sent_tokenised in jobs_ts['title_desc_ts_clean_sent_tok']:
    cleaned_sent_tok_list = [] # Kept sentences for the current row

    for sent in sent_tokenised:
        # Measure the length AFTER stripping: checking the raw sentence would let whitespace-padded
        # fragments through and store them as empty or one/two-character strings.
        stripped_sent = sent.strip()

        # The regex is still matched against the raw sentence so that the whitespace-delimited
        # patterns (e.g. "\scv\s") see exactly the same text as before.
        if len(stripped_sent) > 2 and not is_match(regex, sent):
            cleaned_sent_tok_list.append(stripped_sent)
        else:
            removed_sent_list.append(sent)

    all_cleaned_sent_tok_list.append(cleaned_sent_tok_list)

jobs_ts['title_desc_ts_clean_sent_tok_regex'] = all_cleaned_sent_tok_list

In [6]:
# Some JDs run on without any full stops and form one long sentence, so a single keyword hit
# removes the whole description. For those rows, restore the unfiltered tokenised sentences.
regex_kept_counts = jobs_ts['title_desc_ts_clean_sent_tok_regex'].map(len)
jd_removed_bool = regex_kept_counts.eq(0) # True where the regex filter left an empty list (a row with at least one surviving sentence has length >= 1)
jobs_ts.loc[jd_removed_bool, 'title_desc_ts_clean_sent_tok_regex'] = jobs_ts.loc[jd_removed_bool, 'title_desc_ts_clean_sent_tok']

# Re-assemble the surviving sentences of each row into a single string
jobs_ts['title_desc_ts_postclean_regex'] = jobs_ts['title_desc_ts_clean_sent_tok_regex'].map(' '.join)

Cleaning sentences using a sentence transformer, continuing on from the previously regex-filtered tokenised sentences

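The core idea is that a sentence which is not semantically similar to the notion of tasks and skills (which is what ISCO is based on) should not be part of the JD. Each sentence is embedded and compared against the embedding of a fixed query describing job titles, tasks and skills; similarity is measured as cosine similarity, and sentences at or below the threshold are dropped.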

In [7]:
# Similarity-search settings and model loading
tqdm.pandas(desc='Sentence searching in progress')

batch_size=50
similarity_threshold = 0.50 # tested on 42 sampled cases to remove a significant amount of noisy text
hf_revision = "5c38ec7c405ec4b44b94cc5a9bb96e735b38267a" # version (commit) of the bge sentence transformer to pin

# `revision` is a model-loading option, so it must be given to the constructor to pin the weights.
# encode() does not load anything, so passing the revision there cannot pin the model version.
model = SentenceTransformer("BAAI/bge-small-en-v1.5", trust_remote_code=True, revision=hf_revision) # performed reasonably well when looking at the text removed

# This query encapsulates what the ISCO classification is based on. Each sentence in the JD is compared
# against it by cosine similarity, and only sentences that are similar enough are kept.
query = "Job titles, professions, tasks and skills."
embeddings_query = model.encode(query, device="cpu", batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True)

# Same structure as filtering using regex above: one list of kept / removed sentences per row
all_cleaned_sent_tok_list = []
all_removed_sent_list = []
for sent_tokenised in tqdm(jobs_ts['title_desc_ts_clean_sent_tok_regex'], total=len(jobs_ts)):
    # A row with no sentences has nothing to embed and no first sentence to protect
    if len(sent_tokenised) == 0:
        all_cleaned_sent_tok_list.append([])
        all_removed_sent_list.append([])
        continue

    # Convert the sentences into normalised embeddings (one row per sentence)
    embeddings_sent_tok = model.encode(list(sent_tokenised), device="cpu", batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True)

    # Embeddings are already normalised, so the dot product is the cosine similarity (one score per sentence)
    similarity = embeddings_sent_tok @ embeddings_query

    # Keep only sentences that are similar enough to the query i.e. the ISCO classification criteria
    inclusion_filter_bool = (similarity > similarity_threshold)

    # Always keep the first sentence i.e. the job title, even if it scored below the threshold
    # (these tend to be job titles which are more unique e.g. packers, dock workers).
    # Forcing the mask keeps the title first in the kept list and out of the removed list.
    inclusion_filter_bool[0] = True

    # Build both lists from the original Python strings so the stored values are plain `str`
    # objects rather than numpy string scalars.
    cleaned_sent_tok_list = [sent for sent, keep in zip(sent_tokenised, inclusion_filter_bool) if keep]
    removed_sent_list = [sent for sent, keep in zip(sent_tokenised, inclusion_filter_bool) if not keep]

    # Compile the sentences which passed the similarity check, and those which did not (for inspection)
    all_cleaned_sent_tok_list.append(cleaned_sent_tok_list)
    all_removed_sent_list.append(removed_sent_list)

jobs_ts['title_desc_ts_clean_sent_tok_regex_simsearch'] = all_cleaned_sent_tok_list

# Join all tokenised sentences which passed the similarity check
jobs_ts['title_desc_ts_postclean_regex_simsearch'] = jobs_ts['title_desc_ts_clean_sent_tok_regex_simsearch'].apply(' '.join)


  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
# Safety net: where the joined text is an empty string after regex + similarity search,
# fall back to the regex-cleaned text. Note the check is on the character length of the joined string.
simsearch_col = 'title_desc_ts_postclean_regex_simsearch'
jd_removed_bool = jobs_ts[simsearch_col].map(len) == 0
print(jd_removed_bool.sum(), 'rows had 0 sentences after applying regex + sim search cleaning')

jobs_ts.loc[jd_removed_bool, simsearch_col] = jobs_ts.loc[jd_removed_bool, 'title_desc_ts_postclean_regex']

# Every row must end up with some text
assert min(jobs_ts[simsearch_col].map(len)) > 0, "still have rows which have no values"

0 rows had 0 sentences after applying regex + sim search cleaning


In [9]:
# Write the DataFrame, including the cleaning columns added in the cells above, to the output CSV without the index.
# NOTE(review): the *_sent_tok* columns hold Python lists; presumably they are written as their string
# representation — confirm before parsing them downstream.
jobs_ts.to_csv(jobs_output_path, index=False)