# Indexing a collection of SHL assessments (loaded from a JSON file) with the BM25 algorithm

In [1]:
import json
from rank_bm25 import BM25Okapi

# 1. Read the assessment catalogue from disk
with open("shl_assessments.json", encoding="utf-8") as catalogue_file:
    assessments = json.load(catalogue_file)

# 2. Build one lowercase text blob per assessment: name + description + test_type
docs = []
for record in assessments:
    fields = [record["name"]]
    description = record.get("description")
    if description:
        fields.append(description)
    test_types = record.get("test_type")
    if test_types:
        # test_type may be a single value or a list of values
        fields.extend(test_types if isinstance(test_types, list) else [test_types])
    docs.append(" ".join(fields).lower())

# 3. Whitespace-tokenize every blob and fit the BM25 index on the result
tokenized = list(map(str.split, docs))
bm25 = BM25Okapi(tokenized)

print(f"Indexed {len(assessments)} assessments with BM25.")


Indexed 230 assessments with BM25.


# Combining traditional keyword-based search (BM25) with semantic similarity from Sentence Transformers for more accurate, context-aware ranking

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# ——— Load the sentence-embedding model once; embed_texts() below reuses this instance ———
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_texts(texts: list[str]) -> np.ndarray:
    """
    Encode a batch of strings with the shared local SentenceTransformer.

    Args:
        texts: The strings to embed.

    Returns:
        An (N × D) NumPy array holding one embedding row per input text.
    """
    # Ask the encoder for NumPy output directly and keep the progress bar quiet.
    embeddings = embed_model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=False,
    )
    return embeddings

# ——— Query ———
user_query = "job desc"

# 1) BM25 shortlist: score every indexed assessment against the query tokens
query_tokens = user_query.lower().split()
bm25_scores = bm25.get_scores(query_tokens)

K = 20
# np.argpartition raises ValueError when kth is not a valid index, so a catalogue
# with K or fewer entries skips the partition step and keeps every entry.
num_docs = len(bm25_scores)
if K < num_docs:
    top_idx = np.argpartition(-bm25_scores, K)[:K]
else:
    top_idx = np.arange(num_docs)
# Order the shortlisted indices by descending BM25 score.
top_idx = top_idx[np.argsort(-bm25_scores[top_idx])]
shortlist = [assessments[i] for i in top_idx]

# 2) Embed the user query
query_embedding = embed_texts([user_query])[0]  # shape: (D,)

# 3) Prepare and embed each candidate (name + test types)
cand_texts = []
for cand in shortlist:
    cand_types = cand.get("test_type") or []
    # test_type may be a single string or a list of strings; wrap a bare string
    # so that list concatenation does not raise TypeError.
    if not isinstance(cand_types, list):
        cand_types = [cand_types]
    cand_texts.append(" ".join([cand["name"]] + cand_types))
candidate_embeddings = embed_texts(cand_texts)  # one row per shortlisted candidate

print(f"Got query embedding of size {query_embedding.shape} and {candidate_embeddings.shape[0]} candidate embeddings.")


Got query embedding of size (384,) and 20 candidate embeddings.


# Hybrid semantic search: returning the top N most relevant SHL assessments for a user query

In [13]:
import numpy as np

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Compute cosine similarity between two 1-D arrays.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a Python float.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned in that case instead of NaN, because NaN sorts last in
        ``np.argsort`` and would otherwise be picked as a "top" score.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)

# ——— Inputs carried over from the previous cell ———
#   user_query
#   shortlist: list of assessment-dicts (length K)
#   query_embedding: np.ndarray, shape (D,)
#   candidate_embeddings: np.ndarray, shape (K, D)

N = 3  # how many recommendations to keep

# 1) Score each candidate embedding against the query embedding
scores = []
for emb in candidate_embeddings:
    scores.append(cosine_sim(query_embedding, emb))

# 2) Sort ascending, flip to descending, keep the first N positions
top_n_idx = np.argsort(scores)[::-1][:N]

# 3) Assemble one record per selected assessment
final_recs = []
for pos in top_n_idx:
    chosen = shortlist[pos]
    final_recs.append(
        dict(
            name=chosen["name"],
            url=chosen["url"],
            score=scores[pos],
            duration=chosen.get("duration"),
            remote_testing=chosen.get("remote_testing", False),
            adaptive_irt=chosen.get("adaptive_irt", False),
            test_type=chosen.get("test_type", []),
        )
    )

# 4) Print a numbered summary; the duration line is skipped when it is missing
for i, rec in enumerate(final_recs, start=1):
    print(f"{i}. {rec['name']} (score: {rec['score']:.3f})")
    print(f"   URL: {rec['url']}")
    if rec["duration"] is not None:
        print(f"   Duration: {rec['duration']} min")
    print(f"   Remote: {rec['remote_testing']}, Adaptive: {rec['adaptive_irt']}")
    print(f"   Types: {rec['test_type']}\n")


1. Supervisor - Short Form (score: 0.293)
   URL: https://www.shl.com/products/product-catalog/view/supervisor-short-form/
   Duration: 1 min
   Remote: False, Adaptive: False
   Types: ['Simulations']

2. Support Supervisor Solution (score: 0.210)
   URL: https://www.shl.com/products/product-catalog/view/support-supervisor-solution/
   Duration: 1 min
   Remote: False, Adaptive: False
   Types: ['Ability & Aptitude', 'Personality & Behavior', 'Simulations', 'Biodata & Situational Judgement']

3. Supervisor 7.0 Solution (score: 0.176)
   URL: https://www.shl.com/products/product-catalog/view/supervisor-7-0-solution/
   Duration: 1 min
   Remote: False, Adaptive: False
   Types: ['Biodata & Situational Judgement', 'Competencies']



# Scraping and extracting the job description text from a job posting URL using requests and BeautifulSoup

In [2]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url: str, timeout: float = 10.0) -> str:
    """Fetch a job posting page and return the text of its description block.

    Args:
        url: Address of the job posting page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text inside the first ``<div class="job-description">`` element,
        ``"Job description not found."`` when the page has no such element,
        or an ``"Error fetching the URL: ..."`` message when the request fails
        (connection error, timeout, or a 4xx/5xx status).
    """
    # NOTE: a later cell defines another extract_text_from_url (httpx-based);
    # once that cell runs, it shadows this definition.
    try:
        # Without an explicit timeout, requests can wait indefinitely on an
        # unresponsive server. Timeouts are RequestException subclasses, so
        # they are reported through the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for the HTML element that contains the job description.
        # The class name is site-specific and may need adjusting per platform.
        container = soup.find('div', {'class': 'job-description'})
        if container:
            # Use a space separator so text from adjacent tags is not glued together.
            return container.get_text(separator=' ', strip=True)
        return "Job description not found."

    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"

# Smoke test: run the scraper against a live LinkedIn job posting (makes a real HTTP request)
job_url = "https://www.linkedin.com/jobs/view/research-engineer-ai-at-shl-4194768899/?originalSubdomain=in"
job_desc = extract_text_from_url(job_url)
print(job_desc)


Job description not found.


In [5]:
import httpx
import logging
import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Module-level logger named after the current module; extract_text_from_url reports errors through it
logger = logging.getLogger(__name__)

def extract_text_from_url(url: str, max_chars: int = 5000, timeout: float = 10.0) -> str:
    """Extract the visible text content of a web page using BeautifulSoup.

    Args:
        url: Absolute http(s) URL of the page to fetch.
        max_chars: Maximum number of characters of extracted text to return.
        timeout: Seconds allowed for the HTTP request.

    Returns:
        Up to ``max_chars`` characters of the page text, with text nodes joined
        by single spaces. An empty string is returned when the URL is not a
        valid http(s) URL or when fetching/parsing fails; the failure is
        printed and logged rather than raised.
    """
    try:
        # Validate URL: only absolute http/https addresses can be fetched.
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            print("Invalid URL. Please enter a valid URL.")
            return ""

        # Fetch content. httpx does not follow redirects unless asked to, and
        # raise_for_status() treats an unfollowed 3xx response as an error.
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            response = client.get(url)
            response.raise_for_status()

            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)

            # Limit length for preview
            return text[:max_chars]

    except Exception as e:
        # Deliberately broad: this helper is best-effort and never raises.
        logger.error("Error extracting text from URL: %s", e)
        print(f"Error extracting text from URL: {str(e)}")
        return ""

# Try the function on a live LinkedIn job posting (makes a real HTTP request) and print the extracted text
url = "https://www.linkedin.com/jobs/view/research-engineer-ai-at-shl-4194768899/?originalSubdomain=in"
extracted_text = extract_text_from_url(url)
print(extracted_text)


SHL hiring Research Engineer, AI in Gurgaon, Haryana, India | LinkedIn Skip to main content LinkedIn Research Engineer, AI in Guwahati Expand search This button displays the currently selected search type. When expanded it provides a list of search options that will switch the search inputs to match the current selection. Jobs People Learning Clear text Clear text Clear text Clear text Clear text Join now Sign in Research Engineer, AI SHL Gurgaon, Haryana, India Research Engineer, AI SHL Gurgaon, Haryana, India 2 weeks ago Be among the first 25 applicants See who SHL has hired for this role Report this job Use AI to assess how you fit Get AI-powered advice on this job and more exclusive features. Am I a good fit for this job? Tailor my resume Sign in to access AI-powered advices Sign in Welcome back Email or phone Password Show Forgot password? Sign in or By clicking Continue to join or sign in, you agree to LinkedIn’s User Agreement , Privacy Policy , and Cookie Policy . New to Linked