In [1]:
import bm25s
import Stemmer  # optional: for stemming

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Load data
jobs_ts = pd.read_csv('../output/jobs_ts.csv')
isco = pd.read_csv('../data/wi_labels.csv', dtype=str)


In [3]:
# Some last minute cleaning
jobs_ts['title_desc_ts_postclean'] = jobs_ts['title_desc_ts_postclean'].str.lower()
isco['description'] = isco['description'].str.lower()

In [4]:
# Checking out notes/Some related occupations classified elsewhere” for ISCO
# These additional notes may end up saying how another classification would be better, and may confuse the rag
# so to remove them. Note, "notes" are always after "some related occupations..."
isco['description'] = (isco['description']
                       .str.replace(r'(notes\n.*)', '', regex=True)
                       .str.replace(r'(some related occupations classified elsewhere.*)', '', regex=True))


In [5]:
# Create your corpus here
corpus = isco['description'].to_list()

# optional: create a stemmer
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)


Split strings:   0%|          | 0/436 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/436 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/436 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/436 [00:00<?, ?it/s]

In [9]:
# Query the corpus
query = jobs_ts['title_desc_ts_postclean'][2]
print('query: ', query)
query_tokens = bm25s.tokenize(query, stemmer=stemmer)

# Get top-k results as a tuple of (doc ids, scores). Both are arrays of shape (n_queries, k)
results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=5)

for i in range(results.shape[1]):
    doc, score = results[0, i], scores[0, i]
    print(f"Rank {i+1} (score: {score:.2f}): {doc}")


query:  consultants in emergency medicine - doughiska. the galway clinic is a leading 146 bed, state of the art independent hospital situated on the outskirts of galway, ireland and is an affiliated teaching hospital to the royal college of surgeons in ireland (rcsi) and national university of ireland galway (nuig). there is a 4-bed coronary care unit and 8-bed intensive care/high dependency unit. we are looking to expand our consultant team to cover our busy emergency department service. consultants in emergency medicine the clinic is aiming to recruit full-time or part-time consultants who have completed the appropriate specialist training in emergency medicine and are on the specialist division of the register of the medical council in ireland, or are eligible to be so. the primary responsibility will be working as part of a team of consultants providing patient care within the galway clinics emergency room, which presently opens 10am to 6pm weekdays and 10am to 5pm at weekends. the

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Rank 1 (score: 38.77): nursing associate professionals provide basic nursing and personal care for people in need of such care due to effects of ageing, illness, injury, or other physical or mental impairment. they generally work under the supervision of, and in support of, implementation of health care, treatment and referrals plans established by medical, nursing and other health professionals.
tasks include -
(a)  providing nursing and personal care and treatment and health advice to patients according to care plans established by health professionals; 
(b)  administering medications and other treatments to patients, monitoring patients’ condition and responses to treatment, and referring patients and their families to a health professional for specialized care as needed;
(c)  cleaning wounds and applying surgical dressings;
(d)  updating information on patients’ condition and treatments received in record-keeping systems;
(e)  assisting in planning and managing the care of individu