In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/content/shl_Data.csv")
df.head(2)

Unnamed: 0,Assessment Name,Link,Remote Testing,Adaptive/IRT,Test Type,Description,Job Levels,Languages,Assessment Length
0,Account Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,"C, P, A, B",The Account Manager solution is an assessment ...,"Mid-Professional,","English (USA),",49.0
1,Administrative Professional - Short Form,https://www.shl.com/solutions/products/product...,Yes,Yes,"A, K, P",The Administrative Professional solution is fo...,"Entry-Level,","English (USA),",36.0


In [3]:
!pip install chromadb sentence-transformers pandas -q

import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import math # For checking NaN
import re # For cleaning text

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m98.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[

In [4]:
# Rename columns to match required output fields where possible
df.rename(columns={
    'Link': 'url',
    'Assessment Name': 'name',
    'Remote Testing': 'remote_support',
    'Adaptive/IRT': 'adaptive_support',
    'Assessment Length': 'duration',
    'Test Type': 'test_type_raw', # Keep raw test types for now
    'Description': 'description'
}, inplace=True)

In [5]:
# Clean basic fields
df['description'].fillna('No description available.', inplace=True)
df['name'].fillna('Unnamed Assessment', inplace=True)
for col in ['remote_support', 'adaptive_support']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().str.lower()
        df[col] = df[col].apply(lambda x: 'Yes' if x == 'yes' else 'No')
    else: df[col] = 'No'
if 'duration' in df.columns:
    df['duration'] = pd.to_numeric(df['duration'], errors='coerce').fillna(0).astype(int)
else: df['duration'] = 0

# Clean and Map test_type
if 'test_type_raw' in df.columns:
    df['test_type_list'] = df['test_type_raw'].fillna('').astype(str).apply(
        lambda x: [t.strip() for t in x.split(',') if t.strip()]
    )
    type_mapping = {
        'A': 'Ability', 'B': 'Behavior', 'C': 'Cognitive', 'P': 'Personality',
        'S': 'Simulation', 'K': 'Knowledge & Skills', 'D': 'Development',
        'E': 'Exercise'
    }
    df['test_type_list'] = df['test_type_list'].apply(lambda types: list(set([type_mapping.get(t, t) for t in types])))
else:
    df['test_type_list'] = [[] for _ in range(len(df))]

# Drop invalid rows
df.dropna(subset=['url', 'name'], inplace=True)
df = df[df['url'].str.startswith('http')]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna('No description available.', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['name'].fillna('Unnamed Assessment', inplace=True)


In [6]:
# --- Prepare data for ChromaDB ---
documents = []
metadatas = []
ids = []
required_fields_for_api = ['url', 'adaptive_support', 'description', 'duration', 'remote_support'] # Note: test_type handled separately

for index, row in df.iterrows():
    doc_text = f"{row['name']}: {row['description']}"
    documents.append(re.sub(r'\s+', ' ', doc_text).strip())

    # Create metadata dict with basic types
    meta = {field: row[field] for field in required_fields_for_api if field in row}
    # Ensure correct types and defaults
    meta['url'] = str(meta.get('url', ''))
    meta['adaptive_support'] = str(meta.get('adaptive_support', 'No'))
    meta['description'] = str(meta.get('description', 'No description available.'))
    meta['duration'] = int(meta.get('duration', 0))
    meta['remote_support'] = str(meta.get('remote_support', 'No'))
    meta['name'] = str(row['name']) # Add name for convenience

    # *** CHANGE HERE: Serialize the test_type list to a JSON string ***
    test_type_list = row['test_type_list'] if 'test_type_list' in row else []
    meta['test_type_json'] = json.dumps(test_type_list) # Store as JSON string

    metadatas.append(meta)
    ids.append(f"shl_assessment_{index}") # Make sure IDs are strings

print(f"\nPrepared {len(documents)} documents for ChromaDB.")
if documents:
    print("Example Document:", documents[0][:200] + "...")
    print("Example Metadata (before adding to Chroma):", metadatas[0])
    print("Example ID:", ids[0])



Prepared 389 documents for ChromaDB.
Example Document: Account Manager Solution: The Account Manager solution is an assessment used for job candidates applying to mid-level leadership positions that tend to manage the day-to-day operations and activities ...
Example Metadata (before adding to Chroma): {'url': 'https://www.shl.com/solutions/products/product-catalog/view/account-manager-solution/', 'adaptive_support': 'Yes', 'description': 'The Account Manager solution is an assessment used for job candidates applying to mid-level leadership positions that tend to manage the day-to-day operations and activities of client accounts. Sample tasks for these jobs include, but are not limited to: communicating with clients about project status, developing and maintaining project plans, coordinating internally with appropriate project personnel, and ensuring client expectations are being met. Potential job titles that use this solution are: Account Executive, Account Manager, and Senior Accoun

In [7]:
# Step 5: Initialize Embedding Model
model_name = 'msmarco-distilbert-base-v4'
model = SentenceTransformer(model_name)
print(f"\nLoaded Sentence Transformer model: {model_name}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Loaded Sentence Transformer model: msmarco-distilbert-base-v4


In [8]:
# Step 6: Initialize ChromaDB Client and Collection

# Initialize a persistent client, specifying a directory for data storage
# Replace 'chroma_db_data' with the desired directory path
# If the directory doesn't exist, it will be created.
client = chromadb.PersistentClient(path="./chroma_db_data")

collection_name = "shl_assessments"

# Delete existing collection if it exists (optional, for fresh start during debugging)
try:
    client.delete_collection(name=collection_name)
    print(f"Deleted existing collection: {collection_name}")
except Exception as e:
    print(f"Collection {collection_name} did not exist or couldn't be deleted: {e}")

# Get or create collection (using the specified embedding function)
collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
)
print(f"ChromaDB collection '{collection_name}' created/retrieved.")

Collection shl_assessments did not exist or couldn't be deleted: Collection [shl_assessments] does not exists
ChromaDB collection 'shl_assessments' created/retrieved.


In [9]:
# Step 7: Add Data to ChromaDB (Embeddings generated automatically)
if collection.count() == 0 and len(ids) > 0:
    print(f"Adding {len(ids)} items to the collection...")
    try:
        collection.add(
            documents=documents,
            metadatas=metadatas, # Now contains JSON string for test_type
            ids=ids
        )
        print("Data added to ChromaDB.")
    except Exception as e:
        print(f"!!! ERROR adding data to ChromaDB: {e}") # Catch potential errors during add
elif len(ids) == 0:
     print("No valid data found to add to ChromaDB.")
else:
    print("Collection already contains data. Skipping add.")

print(f"Total items in collection: {collection.count()}")

Adding 389 items to the collection...
Data added to ChromaDB.
Total items in collection: 389


In [10]:
# Step 8: Query Function
def get_recommendations(query_text, top_n=10):
    """Queries ChromaDB and formats results."""
    if collection.count() == 0:
        print("Collection is empty. Cannot query.")
        return {"recommended_assessments": []}

    print(f"\nQuerying for: '{query_text}'")
    results = collection.query(
        query_texts=[query_text],
        n_results=min(top_n, collection.count()), # Ensure n_results <= collection size
        include=['metadatas', 'distances']
    )

    recommended_assessments = []
    if results and results.get('ids') and results['ids'][0]:
        for i, item_id in enumerate(results['ids'][0]):
            meta = results['metadatas'][0][i]
            distance = results['distances'][0][i]

            # *** CHANGE HERE: Parse the test_type JSON string back to a list ***
            test_type_list = []
            test_type_json_str = meta.get('test_type_json', '[]') # Default to empty JSON list string
            try:
                test_type_list = json.loads(test_type_json_str)
                if not isinstance(test_type_list, list): # Ensure it's actually a list
                    test_type_list = []
            except json.JSONDecodeError:
                print(f"Warning: Could not parse test_type_json for ID {item_id}: {test_type_json_str}")
                test_type_list = [] # Default to empty list on error

            # Format according to API spec
            formatted_result = {
                "url": meta.get('url', ''),
                "adaptive_support": meta.get('adaptive_support', 'No'),
                "description": meta.get('description', 'No description available.'),
                "duration": int(meta.get('duration', 0)),
                "remote_support": meta.get('remote_support', 'No'),
                "test_type": test_type_list, # Assign the parsed list here
            }
            recommended_assessments.append(formatted_result)

    # Ensure minimum 1, maximum 10 results as per requirement
    final_recommendations = recommended_assessments[:10] # Max 10
    if len(final_recommendations) == 0 and collection.count() > 0 and len(ids) > 0:
         # If query returned nothing, maybe return the *overall* top item as a fallback (min 1)
         print("Query returned no results, attempting fallback...")
         fallback_results = collection.peek(limit=1)
         if fallback_results and fallback_results.get('ids'):
              meta = fallback_results['metadatas'][0]
              test_type_list_fb = []
              test_type_json_str_fb = meta.get('test_type_json', '[]')
              try: test_type_list_fb = json.loads(test_type_json_str_fb)
              except: pass
              final_recommendations.append({
                 "url": meta.get('url', ''),
                 "adaptive_support": meta.get('adaptive_support', 'No'),
                 "description": meta.get('description', 'No description available.'),
                 "duration": int(meta.get('duration', 0)),
                 "remote_support": meta.get('remote_support', 'No'),
                 "test_type": test_type_list_fb if isinstance(test_type_list_fb, list) else [],
              })

    return {"recommended_assessments": final_recommendations}