In [1]:
import json
import re
import os
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss

In [2]:
# Load the JSON
with open("SHLAssessmentDataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,assessment_name,assessment_url,pdf_text
0,.NET MVC (New),https://www.shl.com/products/product-catalog/v...,.NET MVC\nAssessment Fact Sheet\nOverview\nMul...
1,.NET MVVM (New),https://www.shl.com/products/product-catalog/v...,.NET MVVM\nAssessment Fact Sheet\nOverview\nMu...
2,.NET WCF (New),https://www.shl.com/products/product-catalog/v...,.NET WCF\nAssessment Fact Sheet\nOverview\nMul...


In [4]:
df.loc[0, "pdf_text"]

'.NET MVC\nAssessment Fact Sheet\nOverview\nMulti-choice test that measures the knowledge of Model-View-Controller (MVC) architecture,\nvalidation, security, routing, and areas.\nRelevant Job Roles\n.Net Developer, Software Developer, Software Engineer,\nApplication Developer, Technical Architect, Full Stack\nDeveloper\nDetails\nLanguage\nEnglish (US)\nAverage Testing Time (minutes)\n17\nminutes\nAllowed Time (minutes)\n30\nminutes\nMaximum Number of Questions\n20\nquestions\nNumber of Sittings\nOne\nTest Type\nMultiple Choice - CTT\nSector\nInformation Technology\nScores Reported\n• Overall Score\n• MVC Architecture\n• Models, Views and Controllers\n• Routing and Areas\n• Validation and Security\nO*NET Competency\nProgramming\nKnowledge, Skills,\nAbilities And\nCompetencies\nMeasured\nThe following areas are covered:\n• MVC architecture\n• MVC life cycle\n• Inversion of control and dependency injection\n• Folder structure\n• Controllers, actions, and ﬁlters\n• Coupled and de-coupled c

In [5]:
#rename pdf_text to assess_describe and in that changing "\n" in the whole string to " "
df = df.rename(columns={"pdf_text": "assess_describe"})
df["assess_describe"] = df["assess_describe"].astype(str).str.replace(r"\n", " ", regex=True)

print(df.loc[0, "assess_describe"])


.NET MVC Assessment Fact Sheet Overview Multi-choice test that measures the knowledge of Model-View-Controller (MVC) architecture, validation, security, routing, and areas. Relevant Job Roles .Net Developer, Software Developer, Software Engineer, Application Developer, Technical Architect, Full Stack Developer Details Language English (US) Average Testing Time (minutes) 17 minutes Allowed Time (minutes) 30 minutes Maximum Number of Questions 20 questions Number of Sittings One Test Type Multiple Choice - CTT Sector Information Technology Scores Reported • Overall Score • MVC Architecture • Models, Views and Controllers • Routing and Areas • Validation and Security O*NET Competency Programming Knowledge, Skills, Abilities And Competencies Measured The following areas are covered: • MVC architecture • MVC life cycle • Inversion of control and dependency injection • Folder structure • Controllers, actions, and ﬁlters • Coupled and de-coupled classes • Data models and model binding • Views

In [6]:
df.head(3)

Unnamed: 0,assessment_name,assessment_url,assess_describe
0,.NET MVC (New),https://www.shl.com/products/product-catalog/v...,.NET MVC Assessment Fact Sheet Overview Multi-...
1,.NET MVVM (New),https://www.shl.com/products/product-catalog/v...,.NET MVVM Assessment Fact Sheet Overview Multi...
2,.NET WCF (New),https://www.shl.com/products/product-catalog/v...,.NET WCF Assessment Fact Sheet Overview Multi-...


In [7]:
df["assess_describe"] = (df["assess_describe"].fillna("").astype(str).str.replace(r"\s+", " ", regex=True).str.strip())
df["assessment_name"] = df["assessment_name"].fillna("").astype(str).str.strip()
df["assessment_url"] = df["assessment_url"].fillna("").astype(str).str.strip()
# Drop empty rows (optional)
df = df[df["assess_describe"].str.len() > 0].reset_index(drop=True)

In [8]:
# Load ST model (CPU)
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME, device="cpu")

# Build doc strings (one vector per assessment)
docs = (df["assessment_name"] + " — " + df["assess_describe"]).tolist()

# Encode -> numpy float32, cosine-ready
X = model.encode(
    docs,
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,  # important: enables cosine via inner product
).astype("float32")

# FAISS index (Inner Product = cosine since normalized)
index = faiss.IndexFlatIP(X.shape[1])
index.add(X)

TOP_K = 7

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
TOP_K = 7  # number of recommendations to return

def search(query: str, top_k: int = TOP_K):
    # Encode query -> float32, normalized
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, idxs = index.search(q, top_k)

    rows = []
    for rank, (i, s) in enumerate(zip(idxs[0], scores[0]), 1):
        rec = df.iloc[int(i)]
        rows.append({
            "rank": rank,
            "similarity": float(s),
            "assessment_name": rec["assessment_name"],
            "assessment_url": rec["assessment_url"],
            "assess_snippet": rec["assess_describe"][:400] + ("…" if len(rec["assess_describe"]) > 400 else "")
        })
    return pd.DataFrame(rows)



In [10]:
# Corrected code using triple quotes for the multi-line string
s1 = search("Give me a C++ Programming assessment exam of time about 50 mins.", top_k=TOP_K)
s2 = search("Give me a C++ assessment exam of time about 50 mins.", top_k=TOP_K)
s3 = search("Content Writer required, expert in English and SEO.", top_k=TOP_K)
s4 = search("junior python programming assessment with OOP and exceptions, timed, and asessment should allow more than 15 minutes to complete", top_k=TOP_K)

In [11]:
pd.set_option('display.max_colwidth', None)
s3["assessment_url"].head(5)

0               https://www.shl.com/products/product-catalog/view/search-engine-optimization-new/
1                             https://www.shl.com/products/product-catalog/view/social-media-new/
2          https://www.shl.com/products/product-catalog/view/writex-email-writing-managerial-new/
3    https://www.shl.com/products/product-catalog/view/writex-email-writing-customer-service-new/
4               https://www.shl.com/products/product-catalog/view/writex-email-writing-sales-new/
Name: assessment_url, dtype: object

In [12]:


import re

def boosted_search(query: str,
                   top_k: int = TOP_K,
                   base_fetch: int = 30,
                   name_boost: float = 0.15,
                   url_boost: float = 0.05):
    # Base retrieve more
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, idxs = index.search(q, base_fetch)

    base = pd.DataFrame({"idx": idxs[0], "score": scores[0]})
    base["assessment_name"] = df.iloc[base["idx"]]["assessment_name"].values
    base["assessment_url"]  = df.iloc[base["idx"]]["assessment_url"].values
    base["assess_describe"] = df.iloc[base["idx"]]["assess_describe"].values

    tokens = re.findall(r"\w+", query.lower())
    if tokens:
        pat = "|".join(map(re.escape, dict.fromkeys(tokens)))
        name_hit = base["assessment_name"].str.lower().str.contains(pat, regex=True, na=False)
        url_hit  = base["assessment_url"].str.lower().str.contains(pat, regex=True, na=False)
    else:
        name_hit = pd.Series(False, index=base.index)
        url_hit  = pd.Series(False, index=base.index)

    base["boosted"] = base["score"] + name_hit.astype(float) * name_boost + url_hit.astype(float) * url_boost
    base = base.sort_values("boosted", ascending=False).head(top_k)

    out = []
    for rank, row in enumerate(base.itertuples(index=False), 1):
        snippet = row.assess_describe[:400] + ("…" if len(row.assess_describe) > 400 else "")
        out.append({
            "rank": rank,
            "similarity": float(row.boosted),
            "assessment_name": row.assessment_name,
            "assessment_url": row.assessment_url,
            "assess_snippet": snippet
        })
    return pd.DataFrame(out)



In [13]:
def recommend(query: str, k: int = 7, boosted: bool = True):
    df_res = boosted_search(query, top_k=k) if boosted else search(query, top_k=k)
    # Pretty print
    display_cols = ["rank", "assessment_name", "assessment_url", "similarity", "assess_snippet"]
    return df_res[display_cols]



In [24]:
query = "I want to hire a Senior Data Analyst with 5 years of experience and expertise in SQL, Excel and Python. The assessment can be 1-2 hour long"
recs = recommend(query, k=7, boosted=True)
recs.to_json("sample_recs.json", orient="records", indent=2, force_ascii=False)
recs


Unnamed: 0,rank,assessment_name,assessment_url,similarity,assess_snippet
0,1,Data Science (New),https://www.shl.com/products/product-catalog/view/data-science-new/,0.661852,"Data Science Assessment Fact Sheet Overview Multi-choice test that measures the conceptual knowledge on how to use machine learning to analyze data, extract information, draw conclusions and make statistically-driven decisions. Relevant Job Roles Data Scientists, Analyst - Data Science Details Language English (US) Average Testing Time (minutes) 14 minutes Allowed Time (minutes) 20 minutes Maximum…"
1,2,Salesforce Development (New),https://www.shl.com/products/product-catalog/view/salesforce-development-new/,0.64011,"Salesforce Development Assessment Fact Sheet Overview Multi-choice test that measures the knowledge of Salesforce platform, design and data models, business logic, data management and analytics. Relevant Job Roles Application Developer - Salesforce, Salesforce Developer Details Language English (US) Average Testing Time (minutes) 14 minutes Allowed Time (minutes) 20 minutes Maximum Number of Quest…"
2,3,Statistical Analysis System (New),https://www.shl.com/products/product-catalog/view/statistical-analysis-system-new/,0.638557,"Statistical Analysis System Assessment Fact Sheet Overview Multi-choice test that measures the ability to use the SAS software for statistical analysis. Relevant Job Roles Business Analyst, Data Analyst, Statistician, Psychometric Consultant, Quantitative Methodologist Details Language English (US) Average Testing Time (minutes) 11 minutes Allowed Time (minutes) 15 minutes Maximum Number of Questi…"
3,4,Data Warehousing Concepts,https://www.shl.com/products/product-catalog/view/data-warehousing-concepts/,0.637678,Data Warehousing Concepts Assessment Fact Sheet Overview The Data Warehousing Concepts test measures knowledge of Data Warehousing. Designed for experienced users. Job Family/Title Database Administrators Details Average Testing Time (minutes) 25 minutes Allowed Time (minutes) 90 minutes Maximum Number of Questions 30 questions Number of Sittings One Designed for Unproctored Environment Yes Questi…
4,5,Tableau (New),https://www.shl.com/products/product-catalog/view/tableau-new/,0.636397,"Tableau Assessment Fact Sheet Overview Multi-choice test that measures the knowledge of how to use Tableau to prepare tables, create visualizations, perform calculations, apply ﬁlters and carry out forecasting. Relevant Job Roles Tableau Developer, Application Developer - Tableau, Data Analyst - Tableau, Tableau Engineer, Business Analyst Details Language English (US) Average Testing Time (minutes…"
5,6,Data Entry (New),https://www.shl.com/products/product-catalog/view/data-entry-new/,0.621819,"Data Entry Assessment Fact Sheet Overview Simulated data entry test that measures the ability to accurately transcribe data from pre-ﬁlled forms and the ability to verify pre-ﬁlled data. Relevant Job Roles Data Entry Operator, Data Entry Keyer, Customer Service Representative, Chat Executive Details Language English (US) Average Testing Time (minutes) 04 minutes Allowed Time (minutes) 08 minutes M…"
6,7,MS Excel (New),https://www.shl.com/products/product-catalog/view/ms-excel-new/,0.620969,"MS Excel Assessment Fact Sheet Overview Multi-choice test that measures the ability to use MS Excel to maintain, organize, analyze and present numeric data. Relevant Job Roles Administrative Services Manager, Sales Manager, General and Operations Manager, Marketing Manager, Business Analyst, Consultant Details Language English (US) Average Testing Time (minutes) 06 minutes Allowed Time (minutes) 1…"


#### Getting the Similarity

In [None]:
import json, re, time
from pathlib import Path
from typing import Dict, List, Any, Tuple
import pandas as pd

# ---------- URL helpers ----------
def _normalize_url(u: str) -> str:
    if not isinstance(u, str): return ""
    s = u.strip().lower().replace("https://", "").replace("http://", "")
    if s.startswith("www."): s = s[4:]
    s = re.sub(r"[#?].*$", "", s)
    if s.endswith("/"): s = s[:-1]
    return s

def _slug(u: str) -> str:
    u = _normalize_url(u)
    m = re.search(r"/view/([^/?#]+)", u)
    if m: return m.group(1)
    parts = [p for p in u.split("/") if p]
    return parts[-1] if parts else u

# ---------- core evaluator ----------
def evaluate_recall_from_txt(
    file_path: str,
    K: int = 10,
    boosted: bool = True,
    sleep_sec: float = 0.0,
) -> Dict[str, float]:
    """
    Evaluate Recall@K for each query in Train_dataset.txt (JSON array of dicts),
    using your recommend(query, k=K, boosted=boosted) function.

    Returns: dict like { "<query1>": 0.6, "<query2>": 0.8, ..., "Mean_Recall": 0.7 }
    """
    data = json.loads(Path(file_path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Train_dataset.txt must contain a JSON array of objects.")

    per_query: Dict[str, float] = {}
    n = len(data)

    for i, item in enumerate(data, start=1):
        q = item.get("Query", "")
        gt_urls: List[str] = item.get("assessment_url", []) or []
        gt_slugs = {_slug(u) for u in gt_urls if u}

        # Call your recommender exactly as specified
        try:
            df_res = recommend(q, k=K, boosted=boosted)
        except Exception as e:
            # If something goes wrong, record 0 for this query and continue
            per_query[q] = 0.0
            if sleep_sec and i < n:
                time.sleep(sleep_sec)
            continue

        # Extract predicted URLs from the returned DataFrame
        pred_urls: List[str] = []
        if isinstance(df_res, pd.DataFrame):
            if "assessment_url" in df_res.columns:
                pred_urls = df_res["assessment_url"].astype(str).head(K).tolist()
            else:
                # Try common fallbacks just in case
                for cand in ["url", "URL", "link", "href"]:
                    if cand in df_res.columns:
                        pred_urls = df_res[cand].astype(str).head(K).tolist()
                        break

        pred_slugs = [_slug(u) for u in pred_urls if u]

        # Compute Recall@K
        if gt_slugs:
            hits = len(set(pred_slugs[:K]) & gt_slugs)
            recall_k = hits / len(gt_slugs)
        else:
            recall_k = 0.0  # no labels → define as 0.0 (or skip, your choice)

        per_query[q] = round(recall_k, 4)

        # Respect optional delay between queries (e.g., if calling external APIs upstream)
        if sleep_sec and i < n:
            time.sleep(sleep_sec)

    # Mean recall across all evaluated queries
    mean_recall = round(sum(per_query.values()) / len(per_query), 4) if per_query else 0.0
    per_query["Mean_Recall"] = mean_recall
    return per_query



In [23]:

results = evaluate_recall_from_txt(r"Train_Dataset_SHL.txt", K=10, boosted=True, sleep_sec=0)
print(json.dumps(results, ensure_ascii=False, indent=2))

{
  "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.": 0.6,
  "I am looking for a COO for my company in China and I want to see if they are culturally a right fit for our company. Suggest me an assessment that they can complete in about an hour": 0.0,
  "KEY RESPONSIBITILES:\n\nManage the sound-scape of the station through appropriate creative and marketing interventions to Increase or Maintain the listenership\nActs as an interface between Programming & sales team, thereby supporting the sales team by providing creative inputs in order to increase the overall ad spends by clients\nBuild brand Mirchi by ideating fresh programming initiatives on air campaigns, programming led on-ground events & new properties to ensure brand differentiation & thus increase brand recall at station level\nInvest time in local RJs to grow & develop them as local celebrities\nThrough strong network