#### Importing Librariries

In [None]:
# If needed once:
# !pip install -q pandas numpy sentence-transformers scikit-learn regex

import re, json, math, hashlib, time, importlib.util
from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
import pandas as pd


import unicodedata
from scipy import sparse

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize


#### Setting Prameters(weights, Model, etc)

In [69]:
from scipy import sparse


# ==== PATHS ====
DATA_JSON  = "NewSHLDataset.json"  
CACHE_DIR  = Path("./embedding_cache"); CACHE_DIR.mkdir(parents=True, exist_ok=True)

# ==== MODEL ====
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# ==== COLUMN NAMES (align to your JSON keys) ====
COL_PDF   = "pdf_text"
COL_LEVEL = "job_level"
COL_LANG  = "test_language"
COL_NAME  = "assessment_name"
COL_URL   = "assessment_url"

# RANKING WEIGHTS (semantic all steps; small lexical boost)
W_EMB_LOOK  = 0.65  # looking_for ↔ pdf_text
W_EMB_CONS  = 0.15  # constraints ↔ pdf_text
W_EMB_LANG  = 0.10  # language ↔ language
W_EMB_LEVEL = 0.10  # job_level ↔ job_level

W_TFIDF_LOOK = 0.20 # lexical boost for looking_for topic
W_TFIDF_CONS = 0.10 # lexical boost for constraints

# COVERAGE / OUTPUT
TOP_N         = 20
GUARANTEE_TOP = 4   # ensure coverage in top-4
K_PER_FACET   = 3   # from top-3 by facet, if missing in top-4, bring one up


#### Loading Assessments Dataset

The Dataset is extracted by crawling the SHL website and includes “individual test solutions” only, the dataset has columns assessment_name, assessment_url, pdf_text, job_level, test_type, test_language.
Extraction was done from Product fact Sheet pdf and web page of an assessment using PyMuPDf parser and HTML Parser. Further detail commented in Dataset_Extraction.py, the python file can be rerun to extract the data.

In the dataset, pdf_text stores all info from Product Fact Sheet, we will primarily use this data to understand the Assessment Product.

In [4]:
def load_json_catalog(path: str) -> pd.DataFrame:
    p = Path(path)
    assert p.exists(), f"File not found: {path}"
    # Try JSONL first (common for large dumps)
    try:
        rows = []
        with p.open("r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                line = line.strip()
                if not line: 
                    continue
                try:
                    rows.append(json.loads(line))
                except json.JSONDecodeError:
                    # Not JSONL; fall back to array parsing below
                    rows = None
                    break
        if rows is not None:
            return pd.DataFrame(rows)
    except Exception:
        pass
    # Fallback: full JSON array
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if isinstance(data, dict) and "items" in data:
        data = data["items"]
    assert isinstance(data, list), "Top-level JSON must be a list of objects"
    return pd.DataFrame(data)

df = load_json_catalog(DATA_JSON)
df = df.fillna("")
df = df.reset_index(drop=False).rename(columns={"index":"row_id"})
print(df.shape)
df.head(2)


(377, 7)


Unnamed: 0,row_id,assessment_name,assessment_url,pdf_text,job_level,test_type,test_language
0,0,Global Skills Development Report,https://www.shl.com/products/product-catalog/v...,This report is designed to be given to individ...,"[Director, Entry-Level, Executive, General Pop...","[Ability & Aptitude, Biodata & Situational Jud...",[English (USA)]
1,1,.NET Framework 4.5,https://www.shl.com/products/product-catalog/v...,The.NET Framework 4.5 test measures knowledge ...,"[Mid-Professional, Professional Individual Con...",[Knowledge & Skills],[English (USA)]


Normalizing the Data

In [5]:
def norm_text(x) -> str:
    """Normalize any text/list/dict into a compact single-line string."""
    if isinstance(x, list):
        x = " ".join(map(str, x))
    elif isinstance(x, dict):
        x = json.dumps(x, ensure_ascii=False)
    elif x is None:
        x = ""
    else:
        x = str(x)
    x = x.strip()
    x = re.sub(r"\s+", " ", x)
    return x

# Apply to key columns (handles your sample lists in job_level/test_language)
for c in [COL_PDF, COL_LEVEL, COL_LANG, COL_NAME, COL_URL]:
    if c in df.columns:
        df[c] = df[c].map(norm_text)
    else:
        df[c] = ""  # ensure column exists


#### Making Embedding Cache & TF-IDF cache (build once, reuse)

In [None]:
def parquet_available():
    return (importlib.util.find_spec("pyarrow") is not None
            or importlib.util.find_spec("fastparquet") is not None)

def dataset_fingerprint(df: pd.DataFrame, cols=("row_id", COL_PDF, COL_LEVEL, COL_LANG)) -> str:
    """Stable fingerprint over key fields + model, to invalidate cache on data/model change."""
    h = hashlib.sha256()
    h.update(MODEL_NAME.encode("utf-8"))
    for c in cols:
        s = "\n".join(map(str, df[c].astype(str).tolist()))
        h.update(s.encode("utf-8"))
    return h.hexdigest()

def cache_paths(cache_dir: Path, fp: str):
    meta = cache_dir / f"meta_{fp}.json"
    np_pdf  = cache_dir / f"emb_pdf_{fp}.npy"
    np_lvl  = cache_dir / f"emb_level_{fp}.npy"
    np_lng  = cache_dir / f"emb_lang_{fp}.npy"
    rows_parquet = cache_dir / f"rows_{fp}.parquet"
    rows_csv     = cache_dir / f"rows_{fp}.csv"
    tfidf_pdf    = cache_dir / f"tfidf_pdf_{fp}.npz"
    tfidf_vocab  = cache_dir / f"tfidf_vocab_{fp}.json"
    return meta, np_pdf, np_lvl, np_lng, rows_parquet, rows_csv, tfidf_pdf, tfidf_vocab

def sanitize_rows_df(df_rows: pd.DataFrame, text_cols: List[str]) -> pd.DataFrame:
    out = df_rows.copy()
    for c in text_cols:
        if c in out.columns:
            out[c] = out[c].map(norm_text).astype(object)
    if "row_id" in out.columns:
        out["row_id"] = out["row_id"].astype(int)
    return out


def _utf8_write_text(path: str | Path, data: str):
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    # normalize to NFC to reduce weird ligatures; if still present, UTF-8 handles them
    data = unicodedata.normalize("NFC", data)
    with p.open("w", encoding="utf-8", newline="") as f:
        f.write(data)

def _utf8_to_csv(df, path):
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    # Use UTF-8 with BOM for Excel friendliness on Windows
    df.to_csv(p, index=False, encoding="utf-8-sig", line_terminator="\n")

def _parquet_available():
    return (importlib.util.find_spec("pyarrow") is not None
            or importlib.util.find_spec("fastparquet") is not None)

def save_cache(meta_path, np_pdf, np_lvl, np_lng,
               rows_parquet, rows_csv,
               emb_pdf, emb_level, emb_lang,
               df_rows, tfidf_matrix, tfidf_vocab_path, vocab_dict):
    # --- save embeddings ---
    np.save(np_pdf, emb_pdf); np.save(np_lvl, emb_level); np.save(np_lng, emb_lang)

    # --- save TF-IDF ---
    # you already have a separate tfidf_pdf path in your caller; keep using that
    # (if not, derive from tfidf_vocab_path similarly to earlier patch)
    tfidf_pdf_path = Path(str(tfidf_vocab_path).replace("tfidf_vocab_", "tfidf_pdf_")).with_suffix(".npz")
    sparse.save_npz(tfidf_pdf_path, tfidf_matrix)

    # vocab may contain numpy ints; coerce to plain int and write as UTF-8 JSON
    vocab_plain = {str(k): int(v) for k, v in vocab_dict.items()}
    _utf8_write_text(tfidf_vocab_path, json.dumps(vocab_plain, ensure_ascii=False))

    # --- sanitize rows then write parquet/CSV ---
    def _to_str(x):
        if isinstance(x, list):  x = " ".join(map(str, x))
        elif isinstance(x, dict): x = json.dumps(x, ensure_ascii=False)
        elif x is None: x = ""
        else: x = str(x)
        return unicodedata.normalize("NFC", x)

    text_cols = [COL_NAME, COL_URL, COL_PDF, COL_LEVEL, COL_LANG]
    rows = df_rows.copy()
    for c in text_cols:
        if c in rows.columns:
            rows[c] = rows[c].map(_to_str).astype(object)
    if "row_id" in rows.columns:
        rows["row_id"] = rows["row_id"].astype(int)

    rows_format = "csv"
    if _parquet_available():
        try:
            import pyarrow as pa, pyarrow.parquet as pq
            table = pa.Table.from_pandas(rows, preserve_index=False, safe=True)
            pq.write_table(table, rows_parquet)
            rows_format = "parquet"
        except Exception:
            _utf8_to_csv(rows, rows_csv)
    else:
        _utf8_to_csv(rows, rows_csv)

    meta = {
        "model": MODEL_NAME,
        "created_at": int(time.time()),
        "num_rows": int(rows.shape[0]),
        "rows_format": rows_format,
        "tfidf_npz": str(tfidf_pdf_path)
    }
    _utf8_write_text(meta_path, json.dumps(meta, ensure_ascii=False, indent=2))

def try_load_cache(meta_path, np_pdf, np_lvl, np_lng,
                   rows_parquet, rows_csv, tfidf_pdf, tfidf_vocab):
    p_meta, p_pdf, p_lvl, p_lng = map(Path, [meta_path, np_pdf, np_lvl, np_lng])
    if not (p_meta.exists() and p_pdf.exists() and p_lvl.exists() and p_lng.exists()):
        return None
    if not (Path(rows_parquet).exists() or Path(rows_csv).exists()):
        return None
    if not (Path(tfidf_pdf).exists() and Path(tfidf_vocab).exists()):
        return None
    try:
        meta = json.loads(Path(meta_path).read_text(encoding="utf-8"))
        emb_pdf   = np.load(np_pdf)
        emb_level = np.load(np_lvl)
        emb_lang  = np.load(np_lng)

        if meta.get("rows_format") == "parquet" and Path(rows_parquet).exists() and _parquet_available():
            import pyarrow.parquet as pq
            df_rows = pq.read_table(rows_parquet).to_pandas()
        else:
            df_rows = pd.read_csv(rows_csv, encoding="utf-8-sig")

        from scipy import sparse as _sp
        tfidf_mat = _sp.load_npz(tfidf_pdf)
        vocab_raw = json.loads(Path(tfidf_vocab).read_text(encoding="utf-8"))
        vocab = {str(k): int(v) for k, v in vocab_raw.items()}

        return meta, emb_pdf, emb_level, emb_lang, df_rows, tfidf_mat, vocab
    except Exception:
        return None


# Build or load cache
fp = dataset_fingerprint(df)
meta_path, np_pdf, np_lvl, np_lng, rows_parquet, rows_csv, tfidf_pdf, tfidf_vocab = cache_paths(CACHE_DIR, fp)

loaded = try_load_cache(meta_path, np_pdf, np_lvl, np_lng, rows_parquet, rows_csv, tfidf_pdf, tfidf_vocab)

if loaded is None:
    print("No valid cache. Building embeddings + TF-IDF...")
    emb_pdf   = model.encode(df[COL_PDF].tolist(),   convert_to_tensor=False, normalize_embeddings=True).astype("float32")
    emb_level = model.encode(df[COL_LEVEL].tolist(), convert_to_tensor=False, normalize_embeddings=True).astype("float32")
    emb_lang  = model.encode(df[COL_LANG].tolist(),  convert_to_tensor=False, normalize_embeddings=True).astype("float32")

    # TF-IDF over pdf_text (lexical signal)
    tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=100_000)
    tfidf_mat = tfidf.fit_transform(df[COL_PDF].tolist())
    tfidf_mat = normalize(tfidf_mat, norm="l2", copy=False)
    vocab = tfidf.vocabulary_

    save_cache(meta_path, np_pdf, np_lvl, np_lng,
               rows_parquet, rows_csv,
               emb_pdf, emb_level, emb_lang,
               df[["row_id", COL_NAME, COL_URL, COL_PDF, COL_LEVEL, COL_LANG]],
               tfidf_mat, tfidf_vocab, vocab)
    print("Cache saved.")
else:
    meta, emb_pdf, emb_level, emb_lang, df_cached, tfidf_mat, vocab = loaded
    print(f"Loaded cache: rows={meta['num_rows']}, model={meta['model']}, rows_format={meta['rows_format']}")
    # Align df to cached order if needed
    df = df.merge(df_cached[["row_id"]], on="row_id", how="right").sort_values("row_id").reset_index(drop=True)


Loaded cache: rows=377, model=sentence-transformers/all-MiniLM-L6-v2, rows_format=parquet


Domain priors (to boost recall for implied skills)

In [7]:
ROLE_HINTS: Dict[str, List[str]] = {
    "coo": ["leadership", "strategic thinking", "decision making", "people management", "operations excellence"],
    "chief operating officer": ["leadership", "strategy", "stakeholder management", "problem solving", "communication"],
    "engineering manager": ["leadership", "planning", "communication", "mentoring"],
    "product manager": ["prioritization", "stakeholder management", "communication", "analytical thinking"],
    "data scientist": ["statistics", "machine learning", "python", "sql", "experimentation"],
    # extend over time (active learning)
}

SKILL_SYNONYMS: Dict[str, List[str]] = {
    "sql": ["structured query language", "database querying", "rdbms"],
    "c++": ["c plus plus", "cpp"],
    "javascript": ["js", "ecmascript"],
    "python": ["py"],  # (optional: add pandas/numpy later if it helps)
    "leadership": ["people leadership", "team leadership", "influencing"],
    "aptitude": ["logical reasoning", "numerical reasoning", "verbal reasoning"],
    "behavior": ["work style", "personality", "culture fit"],
}

def expand_text_with_hints(looking_for: str, job_level: str, need_to_assess: str) -> str:
    base = looking_for.lower()
    expansions = []

    # role-based hints
    for k, v in ROLE_HINTS.items():
        if k in base:
            expansions.extend(v)

    # skill synonyms
    tokens = re.findall(r"[a-z0-9\+\#\.]+", base)
    for t in tokens:
        if t in SKILL_SYNONYMS:
            expansions.extend(SKILL_SYNONYMS[t])

    # seniority nudge
    if job_level.lower() in {"senior","lead","principal","executive"}:
        expansions.extend(["advanced", "complex scenarios"])

    # category letters nudge
    if "A" in need_to_assess: expansions.extend(["aptitude", "reasoning", "numerical", "verbal"])
    if "P" in need_to_assess: expansions.extend(["personality", "work style", "culture fit"])
    if "C" in need_to_assess: expansions.extend(["communication", "collaboration", "leadership", "problem solving"])

    expansions = list(dict.fromkeys(expansions))  # dedupe
    return looking_for + (" | HINTS: " + ", ".join(expansions) if expansions else "")


Scoring (semantic all steps + small TF-IDF boost) & Coverage

In [8]:
def cos_to_01(x: np.ndarray) -> np.ndarray:
    return (x + 1.0) / 2.0

def tfidf_query_vector(text: str, vocab: Dict[str,int]) -> sparse.csr_matrix:
    vec = TfidfVectorizer(vocabulary=vocab, ngram_range=(1,2))
    return vec.fit_transform([text])

def score_query(query: dict) -> pd.DataFrame:
    q_look = norm_text(query.get("looking_for",""))
    q_cons = norm_text(query.get("constraints",""))
    q_lvl  = norm_text(query.get("job_level",""))
    q_lang = norm_text(query.get("language",""))
    q_need = norm_text(query.get("need_to_assess",""))

    # expand looking_for with priors
    q_look_expanded = expand_text_with_hints(q_look, q_lvl, q_need)

    # embeddings (normalized) → cosine via dot
    e_look = model.encode(q_look_expanded, convert_to_tensor=False, normalize_embeddings=True) if q_look_expanded else None
    e_cons = model.encode(q_cons,           convert_to_tensor=False, normalize_embeddings=True) if q_cons else None
    e_lvl  = model.encode(q_lvl,            convert_to_tensor=False, normalize_embeddings=True) if q_lvl else None
    e_lang = model.encode(q_lang,           convert_to_tensor=False, normalize_embeddings=True) if q_lang else None

    n = len(df)
    s_look = np.full(n, 0.5, dtype="float32")
    s_cons = np.full(n, 0.5, dtype="float32")
    s_lvl  = np.full(n, 0.5, dtype="float32")
    s_lang = np.full(n, 0.5, dtype="float32")

    if e_look is not None: s_look = cos_to_01(emb_pdf @ e_look)
    if e_cons is not None: s_cons = cos_to_01(emb_pdf @ e_cons)
    if e_lvl  is not None: s_lvl  = cos_to_01(emb_level @ e_lvl)
    if e_lang is not None: s_lang = cos_to_01(emb_lang  @ e_lang)

    # TF-IDF hybrid (topic + constraints)
    s_look_tfidf = np.zeros(n, dtype="float32")
    s_cons_tfidf = np.zeros(n, dtype="float32")
    if q_look:
        qv = tfidf_query_vector(q_look_expanded, vocab)
        s_look_tfidf = (tfidf_mat @ qv.T).toarray().ravel().astype("float32")
        if s_look_tfidf.max() > 0: s_look_tfidf /= s_look_tfidf.max()
    if q_cons:
        qv2 = tfidf_query_vector(q_cons, vocab)
        s_cons_tfidf = (tfidf_mat @ qv2.T).toarray().ravel().astype("float32")
        if s_cons_tfidf.max() > 0: s_cons_tfidf /= s_cons_tfidf.max()

    final = (
        W_EMB_LOOK  * s_look +
        W_EMB_CONS  * s_cons +
        W_EMB_LANG  * s_lang +
        W_EMB_LEVEL * s_lvl  +
        W_TFIDF_LOOK * s_look_tfidf +
        W_TFIDF_CONS * s_cons_tfidf
    )

    out = df[[COL_NAME, COL_URL, COL_PDF, COL_LEVEL, COL_LANG]].copy()
    out["looking_for_score"] = s_look
    out["constraint_score"]  = s_cons
    out["language_score"]    = s_lang
    out["job_level_score"]   = s_lvl
    out["final_score"]       = final
    out = out.sort_values("final_score", ascending=False).reset_index(drop=True)
    return out

# Coverage helpers
def ensure_top_coverage(df_ranked: pd.DataFrame,
                        skills_required: List[str],
                        letters_required: List[str],
                        top_m:int=GUARANTEE_TOP,
                        also_ensure_in_topN: int=TOP_N) -> pd.DataFrame:
    dfw = df_ranked.copy()

    def contains_skill(text, skill):
        return skill.lower() in text.lower()

    LEX = {
        "A": ["aptitude", "reasoning", "numerical", "verbal"],
        "P": ["personality", "work style", "culture", "behavior"],
        "C": ["leadership", "communication", "collaboration", "problem solving"],
        "K": ["knowledge", "skills", "python", "sql", "java", "c++", "domain"],
        "S": ["simulation", "in-tray", "inbox", "case study"],
        "E": ["presentation", "group exercise", "written task"],
        "B": ["situational judgment", "past behavior"],
        "D": ["development", "360", "coaching"]
    }
    def contains_letter(text, letter):
        t = text.lower()
        return any(p in t for p in LEX.get(letter, []))

    # ensure skill coverage
    for skill in skills_required:
        if not any(contains_skill(dfw.loc[i, COL_PDF], skill) for i in range(min(top_m, len(dfw)))):
            for j in range(top_m, min(also_ensure_in_topN, len(dfw))):
                if contains_skill(dfw.loc[j, COL_PDF], skill):
                    row = dfw.iloc[[j]]
                    dfw = pd.concat([dfw.iloc[:top_m-1], row, dfw.drop(dfw.index[j]).iloc[top_m-1:]]).reset_index(drop=True)
                    break

    # ensure letter/category coverage
    for letter in letters_required:
        if not any(contains_letter(dfw.loc[i, COL_PDF], letter) for i in range(min(top_m, len(dfw)))):
            for j in range(top_m, min(also_ensure_in_topN, len(dfw))):
                if contains_letter(dfw.loc[j, COL_PDF], letter):
                    row = dfw.iloc[[j]]
                    dfw = pd.concat([dfw.iloc[:top_m-1], row, dfw.drop(dfw.index[j]).iloc[top_m-1:]]).reset_index(drop=True)
                    break

    return dfw


Time parser & package composer (multi-skill within budget)

In [9]:
def extract_minutes(text: str) -> int:
    """
    Heuristic: parse '45 minutes', '60 min', ranges '40-60 minutes'.
    Returns median of found numbers; 0 if none (treat as unknown).
    """
    t = text.lower()
    nums = []
    for m in re.finditer(r"(\d+)\s*-\s*(\d+)\s*(?:mins?|minutes)", t):
        a,b = int(m.group(1)), int(m.group(2)); nums.extend([a,b])
    for m in re.finditer(r"(\d+)\s*(?:mins?|minutes)", t):
        nums.append(int(m.group(1)))
    if not nums: return 0
    return int(np.median(nums))

def compose_package(df_ranked: pd.DataFrame,
                    skills: List[str],
                    time_budget: int) -> Tuple[List[dict], int]:
    """
    Greedy set cover under time budget:
    pick items that cover the most uncovered skills per minute, favoring higher-ranked ones.
    """
    chosen, covered, total_time = [], set(), 0
    items = []
    for _, r in df_ranked.iterrows():
        txt = r[COL_PDF]
        minutes = extract_minutes(txt) or 30  # fallback default if unknown
        item_skills = [s for s in skills if s.lower() in txt.lower()]
        items.append((r, minutes, set(map(str.lower, item_skills))))

    remaining = set(map(str.lower, skills))
    while remaining:
        best, best_gain = None, 0
        for r, minutes, item_skills in items:
            gain = len(item_skills & remaining)
            if gain <= 0: 
                continue
            if total_time + minutes > time_budget:
                continue
            # Utility: more coverage per minute, lightly reward better final_score
            utility = (gain * (1.0 + float(r["final_score"]))) / max(10, minutes)
            if utility > best_gain:
                best, best_gain = (r, minutes, item_skills), utility
        if best is None:
            break
        r, minutes, item_skills = best
        chosen.append({"name": r[COL_NAME], "url": r[COL_URL], "minutes": minutes, "covers": list(item_skills)})
        total_time += minutes
        remaining -= item_skills

    return chosen, total_time


End-to-end recommend()

In [10]:
def parse_skills_from_looking_for(text: str) -> List[str]:
    # Minimal keyword skill extractor; extend as needed
    candidates = ["python","sql","javascript","java","c++","react","excel","leadership","communication"]
    t = text.lower()
    return [s for s in candidates if s in t]

def recommend(query: dict):
    # 1) Score
    ranked = score_query(query)

    # 2) Coverage in top-N
    skills_req  = parse_skills_from_looking_for(query.get("looking_for",""))
    letters_req = [x.strip() for x in query.get("need_to_assess","").split(",") if x.strip()]
    ranked_cov  = ensure_top_coverage(ranked, skills_req, letters_req, top_m=GUARANTEE_TOP, also_ensure_in_topN=TOP_N)

    # 3) Package if max time in constraints (e.g., "max 60 minutes")
    time_budget = 0
    cons = query.get("constraints","").lower()
    m = re.search(r"(?:max|maximum)\s*(\d{1,3})\s*(?:mins?|minutes)", cons)
    if m:
        time_budget = int(m.group(1))

    package, total_time = [], 0
    if time_budget > 0 and len(skills_req) >= 2:
        package, total_time = compose_package(ranked_cov.head(50), skills_req, time_budget)

    # 4) Prepare topN with a brief why
    topN = ranked_cov.head(TOP_N).copy()
    def why(r):
        return (f"topic={r['looking_for_score']:.2f} | cons={r['constraint_score']:.2f} | "
                f"lang={r['language_score']:.2f} | level={r['job_level_score']:.2f}")
    topN["why"] = topN.apply(why, axis=1)

    return {
        "topN": topN[[COL_NAME, COL_URL, "final_score", "why"]],
        "skills_required": skills_req,
        "letters_required": letters_req,
        "package": package,
        "package_total_minutes": total_time,
        "time_budget": time_budget
    }


In [11]:
from Query_Restructured import get_assessment_summary

In [12]:
#Funtions to recheck and make sure output from get_assessment_summmary and recommend is in correct format

import json, re
import pandas as pd

def _robust_parse_summary(json_text):
    """Safely extract and parse the JSON block from LLM output."""
    if isinstance(json_text, dict):
        d = json_text
    else:
        if not isinstance(json_text, str):
            return {}
        start, end = json_text.find("{"), json_text.rfind("}")
        if start == -1 or end == -1 or end <= start:
            return {}
        try:
            d = json.loads(json_text[start:end+1])
        except Exception:
            return {}
    for k in ["looking_for","constraints","job_level","need_to_assess","language"]:
        d.setdefault(k, "")
        if not isinstance(d[k], str):
            d[k] = str(d[k])
    return d





In [65]:
Query = "To assess candidates proficient in Python, SQL, and JavaScript for a technical role, ensuring they also demonstrate a good cultural fit with the company."
prompt_new = get_assessment_summary(Query)
prompt_new

[ACK] 1/1: {"ack":"received"}


{'looking_for': 'Assess candidates proficient in Python, SQL, and JavaScript for a technical role.',
 'constraints': '',
 'job_level': 'unknown',
 'need_to_assess': 'K,P',
 'language': 'unknown'}

In [66]:
prompt_new = _robust_parse_summary(prompt_new)
prompt_new

{'looking_for': 'Assess candidates proficient in Python, SQL, and JavaScript for a technical role.',
 'constraints': '',
 'job_level': 'unknown',
 'need_to_assess': 'K,P',
 'language': 'unknown'}

In [91]:
prompt_new = {
  "looking_for": "A Research Engineer with expertise in AI/ML, including Natural Language Processing, computer vision, and generative AI, proficient in Python and ML frameworks like TensorFlow and PyTorch, to develop, prototype, and deploy robust AI/ML models and influence product roadmaps. The assessment should screen candidates' technical knowledge and practical application skills.",
  "constraints": "max assessment time in mins: 30",
  "job_level": "mid",
  "need_to_assess": "K,S,C,A",
  "language": "English"
}

In [92]:
TOP_N = 25

In [93]:
res1 = recommend(prompt_new)
display(res1["topN"])

Unnamed: 0,assessment_name,assessment_url,final_score,why
0,AI Skills,https://www.shl.com/products/product-catalog/v...,1.056579,topic=0.80 | cons=0.73 | lang=0.91 | level=0.52
1,Data Science (New),https://www.shl.com/products/product-catalog/v...,0.95375,topic=0.75 | cons=0.66 | lang=0.91 | level=0.61
2,Interpersonal Communications,https://www.shl.com/products/product-catalog/v...,0.905916,topic=0.68 | cons=0.64 | lang=0.91 | level=0.62
3,Multitasking Ability,https://www.shl.com/products/product-catalog/v...,0.855121,topic=0.69 | cons=0.68 | lang=0.91 | level=0.63
4,Python (New),https://www.shl.com/products/product-catalog/v...,0.891213,topic=0.73 | cons=0.63 | lang=0.91 | level=0.61
5,Job Control Language (New),https://www.shl.com/products/product-catalog/v...,0.884004,topic=0.72 | cons=0.67 | lang=0.91 | level=0.61
6,Visual Basic for Applications (New),https://www.shl.com/products/product-catalog/v...,0.880677,topic=0.68 | cons=0.63 | lang=0.91 | level=0.61
7,Verify - Verbal Ability - Next Generation,https://www.shl.com/products/product-catalog/v...,0.872818,topic=0.70 | cons=0.69 | lang=0.86 | level=0.60
8,Siebel Development (New),https://www.shl.com/products/product-catalog/v...,0.866747,topic=0.70 | cons=0.61 | lang=0.91 | level=0.61
9,Global Skills Assessment,https://www.shl.com/products/product-catalog/v...,0.861741,topic=0.68 | cons=0.68 | lang=0.71 | level=0.64


In [16]:
type(res1)

dict

### Testing Recall@10 for single query from Train Dataset

In [None]:
## Labeled urls from the dataset
check = {
    "Query": "I want to hire new graduates for a sales role in my company, the budget is for about an hour for each test. Give me some options",
    "assessment_url": [
      "https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-7-1/",
      "https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-sift-out-7-1/",
      "https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-solution/",
      "https://www.shl.com/solutions/products/product-catalog/view/sales-representative-solution/",
      "https://www.shl.com/products/product-catalog/view/business-communication-adaptive/",
      "https://www.shl.com/solutions/products/product-catalog/view/technical-sales-associate-solution/",
      "https://www.shl.com/solutions/products/product-catalog/view/svar-spoken-english-indian-accent-new/",
      "https://www.shl.com/products/product-catalog/view/interpersonal-communications/",
      "https://www.shl.com/solutions/products/product-catalog/view/english-comprehension-new/"
    ]
  }

In [77]:
import re, json
import pandas as pd
from IPython.display import display

def _normalize_url(u: str) -> str:
    if not isinstance(u, str): return ""
    s = u.strip().lower()
    s = s.replace("https://","").replace("http://","")
    if s.startswith("www."): s = s[4:]
    s = re.sub(r"[#?].*$", "", s)  # drop query/fragment
    if s.endswith("/"): s = s[:-1]
    return s

def _slug(u: str) -> str:
    u = _normalize_url(u)
    m = re.search(r"/view/([^/?#]+)", u)
    if m: return m.group(1)
    parts = [p for p in u.split("/") if p]
    return parts[-1] if parts else u

def _robust_parse_summary(json_text: str) -> dict:
    if not isinstance(json_text, str):
        return json_text if isinstance(json_text, dict) else {}
    start, end = json_text.find("{"), json_text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return {}
    try:
        data = json.loads(json_text[start:end+1])
    except Exception:
        return {}
    # coerce expected keys
    for k in ["looking_for","constraints","job_level","need_to_assess","language"]:
        data.setdefault(k, "")
        if not isinstance(data[k], str):
            data[k] = str(data[k])
    return data

def _extract_pred_urls_from_topN(topN, k: int = 10) -> list[str]:
    urls = []
    if isinstance(topN, pd.DataFrame):
        # try common URL columns
        for col in ["assessment_url","url","URL","link","href"]:
            if col in topN.columns:
                urls = topN[col].astype(str).head(k).tolist()
                break
    elif isinstance(topN, (list, tuple)) and topN:
        if isinstance(topN[0], dict):
            for col in ["assessment_url","url","URL","link","href"]:
                if col in topN[0]:
                    urls = [str(d.get(col,"")) for d in topN[:k]]
                    break
        else:
            urls = [str(x) for x in topN[:k]]
    return urls

def recall_at_k_single(check: dict, K: int = 10, verbose: bool = True) -> float:
    """
    check = {
        "Query": "...",
        "assessment_url": [list of ground-truth URLs]
    }
    Returns Recall@K (float). Prints details if verbose=True.
    """
    query_text = check.get("Query","")
    gt_urls = [u for u in check.get("assessment_url", []) if u]
    gt_slugs = {_slug(u) for u in gt_urls}

    # 1) Summarize query with your LLM
    prompt_out = get_assessment_summary(query_text)

    # 2) Call your recommender.
    #    Your current usage is recommend(prompt_string), but if it expects a dict
    #    we’ll parse and try recommend(parsed_dict) as a fallback.
    try:
        rec = recommend(prompt_out)
    except Exception:
        parsed = _robust_parse_summary(prompt_out)
        rec = recommend(parsed)

    topN = rec.get("topN", None) if isinstance(rec, dict) else rec
    pred_urls = _extract_pred_urls_from_topN(topN, k=K)
    pred_slugs = [_slug(u) for u in pred_urls if u]

    # 3) Compute recall
    hits = len(set(pred_slugs) & gt_slugs)
    denom = max(len(gt_slugs), 1)
    recall_k = hits / denom

    if verbose:
        print("="*80)
        print("Query:")
        print(query_text)
        print("\nParsed JSON from summarizer:")
        print(_robust_parse_summary(prompt_out))
        if isinstance(topN, pd.DataFrame):
            show_cols = [c for c in ["assessment_name","assessment_url","final_score","why"] if c in topN.columns]
            print("\nTop-K table:")
            display(topN[show_cols].head(K))
        print("\nGround-truth slugs:")
        print(sorted(gt_slugs))
        print("\nPredicted slugs:")
        print(pred_slugs)
        print(f"\nHits@{K}: {hits} / {len(gt_slugs)}  →  Recall@{K}: {recall_k:.4f}")

    return recall_k


In [78]:
recall_at_k_single(check, K=10, verbose=True)

[ACK] 1/1: {"ack":"received"}
Query:
I want to hire new graduates for a sales role in my company, the budget is for about an hour for each test. Give me some options

Parsed JSON from summarizer:
{'looking_for': 'A summary for new graduates applying for a sales role, focusing on assessing their communication, persuasion, and problem-solving skills.', 'constraints': 'average assessment time: 60 mins', 'job_level': 'junior', 'need_to_assess': 'C,B,P,S,A', 'language': 'unknown'}

Top-K table:


Unnamed: 0,assessment_name,assessment_url,final_score,why
0,Interpersonal Communications,https://www.shl.com/products/product-catalog/v...,0.938613,topic=0.74 | cons=0.65 | lang=0.60 | level=0.64
1,Business Communication (adaptive),https://www.shl.com/products/product-catalog/v...,0.88001,topic=0.71 | cons=0.64 | lang=0.60 | level=0.64
2,Multitasking Ability,https://www.shl.com/products/product-catalog/v...,0.852732,topic=0.68 | cons=0.67 | lang=0.60 | level=0.65
3,Retail Sales and Service Simulation,https://www.shl.com/products/product-catalog/v...,0.817455,topic=0.78 | cons=0.58 | lang=0.60 | level=0.68
4,Sales Interview Guide,https://www.shl.com/products/product-catalog/v...,0.833823,topic=0.81 | cons=0.54 | lang=0.65 | level=0.65
5,SHL Verify Interactive – Numerical Reasoning,https://www.shl.com/products/product-catalog/v...,0.826681,topic=0.68 | cons=0.66 | lang=0.55 | level=0.64
6,OPQ MQ Sales Report,https://www.shl.com/products/product-catalog/v...,0.821911,topic=0.81 | cons=0.54 | lang=0.55 | level=0.63
7,Graduate Scenarios,https://www.shl.com/products/product-catalog/v...,0.814403,topic=0.75 | cons=0.66 | lang=0.65 | level=0.64
8,Verify - General Ability Screen,https://www.shl.com/products/product-catalog/v...,0.798859,topic=0.72 | cons=0.67 | lang=0.54 | level=0.68
9,Occupational Personality Questionnaire OPQ32r,https://www.shl.com/products/product-catalog/v...,0.796134,topic=0.71 | cons=0.65 | lang=0.55 | level=0.64



Ground-truth slugs:
['business-communication-adaptive', 'english-comprehension-new', 'entry-level-sales-7-1', 'entry-level-sales-sift-out-7-1', 'entry-level-sales-solution', 'interpersonal-communications', 'sales-representative-solution', 'svar-spoken-english-indian-accent-new', 'technical-sales-associate-solution']

Predicted slugs:
['interpersonal-communications', 'business-communication-adaptive', 'multitasking-ability', 'retail-sales-and-service-simulation', 'sales-interview-guide', 'shl-verify-interactive-numerical-reasoning', 'opq-mq-sales-report', 'graduate-scenarios', 'verify-general-ability-screen', 'occupational-personality-questionnaire-opq32r']

Hits@10: 2 / 9  →  Recall@10: 0.2222


0.2222222222222222

### Recall@10 Evaluation from the whole train dataset

In [20]:
import os, glob

# Delete prediction cache files
for f in glob.glob("pred_cache.json*"):
    try:
        os.remove(f)
        print(f"Deleted cache file: {f}")
    except Exception as e:
        print(f"Could not delete {f}: {e}")

In [21]:
import re, json, time
import pandas as pd

# ------- CONFIG -------
EXCEL_PATH = r"C:\Users\vinee\Downloads\Gen_AI Dataset.xlsx"  # <-- your dataset path
QUERY_COL   = "Query"
URL_COL     = "Assessment_url"
K           = 10
SLEEP_SEC   = 20   # delay between each query

# ------- Helpers -------
def _normalize_url(u: str) -> str:
    if not isinstance(u, str): return ""
    s = u.strip().lower().replace("https://", "").replace("http://", "")
    if s.startswith("www."): s = s[4:]
    s = re.sub(r"[#?].*$", "", s)
    if s.endswith("/"): s = s[:-1]
    return s

def _slug(u: str) -> str:
    u = _normalize_url(u)
    m = re.search(r"/view/([^/?#]+)", u)
    if m: return m.group(1)
    parts = [p for p in u.split("/") if p]
    return parts[-1] if parts else u

def _robust_parse_summary(json_text: str) -> dict:
    """Extract first {...} object and coerce expected keys to strings. If already dict, pass through."""
    if isinstance(json_text, dict):
        d = json_text
    else:
        if not isinstance(json_text, str):
            return {}
        start, end = json_text.find("{"), json_text.rfind("}")
        if start == -1 or end == -1 or end <= start:
            return {}
        try:
            d = json.loads(json_text[start:end+1])
        except Exception:
            return {}
    for k in ["looking_for","constraints","job_level","need_to_assess","language"]:
        d.setdefault(k, "")
        if not isinstance(d[k], str): d[k] = str(d[k])
    return d

def _extract_pred_urls_from_topN(topN, k: int = 10) -> list[str]:
    urls = []
    if isinstance(topN, pd.DataFrame):
        for col in ["assessment_url","url","URL","link","href"]:
            if col in topN.columns:
                urls = topN[col].astype(str).head(k).tolist()
                break
    elif isinstance(topN, (list, tuple)) and topN:
        if isinstance(topN[0], dict):
            for col in ["assessment_url","url","URL","link","href"]:
                if col in topN[0]:
                    urls = [str(d.get(col,"")) for d in topN[:k]]
                    break
        else:
            urls = [str(x) for x in topN[:k]]
    return urls

def recall_at_k(true_slugs: set, pred_slugs: list, k: int) -> float:
    if not true_slugs: return 0.0
    return len(set(pred_slugs[:k]) & true_slugs) / len(true_slugs)

# ------- Load dataset -------
labels_df = pd.read_excel(EXCEL_PATH).fillna("")
if QUERY_COL not in labels_df.columns or URL_COL not in labels_df.columns:
    raise ValueError(f"Excel must have columns '{QUERY_COL}' and '{URL_COL}'")

labels_df["slug"] = labels_df[URL_COL].astype(str).map(_slug)
grouped = (
    labels_df.groupby(QUERY_COL)["slug"]
    .apply(lambda s: set(u for u in s if u))
    .reset_index()
    .rename(columns={"slug":"relevant_slugs"})
)

# ------- Evaluate silently -------
recall_results = {}

for i, row in grouped.iterrows():
    q_text = row[QUERY_COL]
    true_slugs = row["relevant_slugs"]

    prompt_out = get_assessment_summary(q_text)
    q_obj = _robust_parse_summary(prompt_out)

    try:
        rec = recommend(prompt_out)
    except Exception:
        rec = recommend(q_obj)

    topN = rec.get("topN", None) if isinstance(rec, dict) else rec
    pred_urls = _extract_pred_urls_from_topN(topN, k=K)
    pred_slugs = [_slug(u) for u in pred_urls if u]

    r_k = recall_at_k(true_slugs, pred_slugs, K)
    recall_results[q_text] = round(r_k, 4)

    # wait 20 seconds between queries
    if i < len(grouped) - 1:
        time.sleep(SLEEP_SEC)

# ------- Compute mean recall -------
mean_recall = round(sum(recall_results.values()) / len(recall_results), 4) if recall_results else 0.0
recall_results["Mean_Recall"] = mean_recall

# ------- Final JSON-style output -------
recall_results_json = json.dumps(recall_results, ensure_ascii=False, indent=2)
print(recall_results_json)


[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
[ACK] 1/1: {"ack":"received"}
{
  "Based on the JD below recommend me assessment for the Consultant position in my organizations. The assessment should not be more than 90 mins\nb Description\n\n Job Purpose \n\nResponsibilities\n\nThe Consultant role supports the broader professional services organization by leading the delivery of impactful client solutions and advising on client programs. Consultants are expected to deliver solutions, services, and insights that are driven by industry best practices and that positively impact client business objectives and partnerships. Individuals in this role provide I/O technical guidance to internal and external stakeholders, and drive continuous improvement related to project del