Web Scraping: Adaptive and Pre-packaged Assessments

In [None]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Catalogue landing page; the same URL the other detail scrapers in this file open.
CATALOG_URL = "https://www.shl.com/solutions/products/product-catalog/"
ASSESSMENT_LINK_SELECTOR = "a[href*='/product-catalog/view/']"


def _first_text(browser, css_selector):
    """Return the stripped text of the first match for *css_selector*, or None if absent."""
    try:
        return browser.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except NoSuchElementException:
        return None


def _all_texts(browser, css_selector):
    """Return the non-empty, stripped texts of every match for *css_selector*."""
    stripped = (elem.text.strip() for elem in browser.find_elements(By.CSS_SELECTOR, css_selector))
    return [text for text in stripped if text]


def _test_types(browser):
    """Return the single-letter test type codes from the 'Test Type' paragraph ([] if absent)."""
    try:
        # XPath 1.0 spells this as the function starts-with(); the method-call
        # form "normalize-space().startswith(...)" is not valid XPath.
        para = browser.find_element(
            By.XPATH, "//p[starts-with(normalize-space(), 'Test Type')]"
        )
    except NoSuchElementException:
        return []
    # e.g. "Test Type: A B P" -> ['A', 'B', 'P']
    return re.findall(r'\b[A-Z]\b', para.text)


def _remote_testing(browser):
    """Return True when the 'Remote Testing' paragraph carries a '-yes' circle."""
    for para in browser.find_elements(By.CSS_SELECTOR, "p.product-catalogue__small-text"):
        if para.text.strip().startswith("Remote Testing"):
            return bool(para.find_elements(By.CSS_SELECTOR, "span.catalogue__circle.-yes"))
    return False


def _adaptive_flag(anchor):
    """Return the Adaptive/IRT flag of the catalogue row containing *anchor*.

    Returns None when the anchor is not inside a table row with at least three
    cells, so callers can tell "unknown" apart from "not adaptive".
    """
    try:
        row = anchor.find_element(By.XPATH, "./ancestor::tr[1]")
    except NoSuchElementException:
        return None
    cells = row.find_elements(By.TAG_NAME, "td")
    # NOTE(review): assumes the catalogue row layout used by the table scraper
    # in this file (third cell = Adaptive/IRT) - confirm against the live page.
    if len(cells) < 3:
        return None
    return bool(cells[2].find_elements(By.CSS_SELECTOR, ".catalogue__circle.-yes"))


# ---------------- Scraping Individual (Adaptive/IRT) Assessments ----------------
assessment_links = set()
adaptive_map = {}  # assessment URL -> Adaptive/IRT flag read from the catalogue row
data = []

try:
    # The pagination loop needs a loaded page to look at.
    driver.get(CATALOG_URL)
    time.sleep(2)

    # STEP 1: Collect all assessment links with pagination
    while True:
        # wait for the assessment links to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ASSESSMENT_LINK_SELECTOR))
        )
        time.sleep(2)

        # collect current page's assessment links (and their Adaptive/IRT flag)
        for elem in driver.find_elements(By.CSS_SELECTOR, ASSESSMENT_LINK_SELECTOR):
            link = elem.get_attribute('href')
            if not link:
                continue
            assessment_links.add(link)
            flag = _adaptive_flag(elem)
            if flag is not None:
                adaptive_map[link] = flag

        # find all "Next" buttons on page
        next_buttons = driver.find_elements(By.LINK_TEXT, "Next")

        # if there are at least 2, the second is the Individual Test Solutions pagination
        if len(next_buttons) < 2:
            print("No more pages to paginate. Finished collecting links.")
            break
        if next_buttons[1].get_attribute("aria-disabled") == "true":
            print("⛔ Reached last page; stopping.")
            break

        # click next page and give it time to load
        next_buttons[1].click()
        time.sleep(3)

    print(f"Total unique assessments found: {len(assessment_links)}")

    # STEP 2: Visit each assessment page and extract details.
    # Sorted so the CSV row order is reproducible between runs.
    for link in sorted(assessment_links):
        driver.get(link)
        time.sleep(2)

        data.append({
            'Title': _first_text(driver, "h1.main-title"),
            'Description': _first_text(driver, "div.prose"),
            'Job Levels': _all_texts(driver, "div.job-levels span"),
            'Languages': _all_texts(driver, "ul.languages li"),
            'Assessment Length': _first_text(
                driver, "p.product-catalogue__small-text span.catalogue__text"
            ),
            'Test Type': ", ".join(_test_types(driver)),
            'Remote Testing': _remote_testing(driver),
            'Adaptive/IRT': adaptive_map.get(link),  # None when the flag was not found
            'URL': link
        })
finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()

# STEP 3: Save to CSV
df = pd.DataFrame(data)
df.to_csv('shl_pre_pack.csv', index=False)
print("✅ Saved to shl_pre_pack.csv")


CSV Merging

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd

# ---------------- Scraping Pre-packaged Assessments (Global Pre-packaged Table) ----------------
# 1) Point to Pre-packaged URL only
URL = "https://www.shl.com/products/product-catalog/?type=0"

# 2) Setup headless Chrome
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# 3) Initialize list to store assessments
assessments = []

try:
    driver.get(URL)
    time.sleep(2)

    while True:
        print("📄 Scraping current page…")

        # Extract rows from the Pre-packaged table
        table = driver.find_element(By.CSS_SELECTOR, "div.custom__table-responsive > table")
        for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
            cells = row.find_elements(By.TAG_NAME, "td")
            # Skip rows that do not have all four data cells (e.g. header-like rows).
            if len(cells) < 4:
                continue

            link = cells[0].find_element(By.TAG_NAME, "a")
            assessments.append({
                "Title":         link.text.strip(),
                "URL":           link.get_attribute("href"),
                # A "-yes" circle in the cell marks the feature as supported.
                "RemoteTesting": bool(cells[1].find_elements(By.CSS_SELECTOR, ".catalogue__circle.-yes")),
                "Adaptive/IRT":  bool(cells[2].find_elements(By.CSS_SELECTOR, ".catalogue__circle.-yes")),
                "TestTypes":     ", ".join(
                    e.text for e in cells[3].find_elements(By.CSS_SELECTOR, ".product-catalogue__key")
                ),
            })

        # 4) Handle pagination via the global pager
        try:
            next_li = driver.find_element(
                By.CSS_SELECTOR,
                "ul.pagination li.pagination__item.-arrow.-next"
            )
            # If this li has "-disabled", we're done ("or ''" guards a missing class attribute)
            if "-disabled" in (next_li.get_attribute("class") or ""):
                print("⛔ Reached last page; stopping.")
                break

            # Otherwise click its <a> child to advance
            next_li.find_element(By.TAG_NAME, "a").click()
        except WebDriverException:
            # Best-effort: any Selenium failure here ends pagination but keeps
            # the rows gathered so far. Non-Selenium errors still propagate.
            print("⚠️ Cannot find next arrow; exiting pagination loop.")
            break

        time.sleep(3)  # wait for new page to load
finally:
    # Release the browser process whether or not scraping succeeded.
    driver.quit()

# 5) Save to CSV
df = pd.DataFrame(assessments)
df.to_csv("shl_prepackaged_only.csv", index=False)
print(f"✅ Extracted {len(df)} rows → shl_prepackaged_only.csv")


In [None]:
import pandas as pd

# Merge Step 1: read the pre-packaged table CSV and the individual table CSV
# (in that order), stack them row-wise with a fresh index, drop exact
# duplicate rows, and write the combined table to a new CSV.
df1, df2 = (
    pd.read_csv(table_csv)
    for table_csv in (
        'C:\\Users\\vidhi\\Downloads\\my_shl\\shl_prepackaged_no_duplicates_table.csv',
        'C:\\Users\\vidhi\\Downloads\\my_shl\\indiv_table_no_duplicates.csv',
    )
)

merged_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()

merged_df.to_csv('merged_table_output.csv', index=False)


In [None]:
import pandas as pd

# Merge Step 2: Load the two CSV files with scraped data
# The detail scrapers in this file do not agree on column headers
# ('Duration' vs 'Assessment Length', 'Test Type' vs 'TestTypes',
# 'Remote Testing' vs 'RemoteTesting'). Map them onto one spelling first so
# the concatenation stacks like with like instead of creating half-empty
# parallel columns. rename() ignores aliases that a frame does not contain.
COLUMN_ALIASES = {
    'Duration': 'Assessment Length',
    'Test Type': 'TestTypes',
    'Remote Testing': 'RemoteTesting',
}

df1 = pd.read_csv('C:\\Users\\vidhi\\Downloads\\my_shl\\shl_pre_pack.csv').rename(columns=COLUMN_ALIASES)
df2 = pd.read_csv('C:\\Users\\vidhi\\Downloads\\my_shl\\shl_individual_assess.csv').rename(columns=COLUMN_ALIASES)

# Concatenate both DataFrames row-wise
merged_df = pd.concat([df1, df2], ignore_index=True)

# Remove exact duplicate rows (the same assessment scraped by both runs)
merged_df = merged_df.drop_duplicates()

# Save to new CSV
merged_df.to_csv('merged_detail_output.csv', index=False)
print("CSV files merged successfully and saved to 'merged_detail_output.csv'")


In [None]:
import pandas as pd

# Merge Step 3: Load the merged detail and table CSVs
df1 = pd.read_csv('C:\\Users\\vidhi\\Downloads\\my_shl\\merged_detail_output.csv')
df2 = pd.read_csv('C:\\Users\\vidhi\\Downloads\\my_shl\\merged_table_output.csv')[['URL', 'Adaptive/IRT']]

# One flag per URL: a repeated URL on the right side of a left merge would
# duplicate the matching detail rows.
df2 = df2.drop_duplicates(subset='URL')

# Merge Adaptive/IRT into df1 based on URL. The detail CSV may already carry an
# 'Adaptive/IRT' column; the suffix keeps that name for the detail values
# instead of letting pandas split it into 'Adaptive/IRT_x' / 'Adaptive/IRT_y'.
merged_df = pd.merge(df1, df2, on='URL', how='left', suffixes=('', '_table'))

if 'Adaptive/IRT_table' in merged_df.columns:
    # Prefer the table's flag; fall back to the detail value where the table has none.
    merged_df['Adaptive/IRT'] = merged_df['Adaptive/IRT_table'].combine_first(merged_df['Adaptive/IRT'])
    merged_df = merged_df.drop(columns='Adaptive/IRT_table')

# Save the final merged CSV
merged_df.to_csv('final.csv', index=False)
print("Merged file saved as 'final.csv'")


Data Cleaning and Export

In [None]:
# Progress marker printed before the cleaning stage starts.
print("✅ Scraping complete. Starting data cleaning...")

import pandas as pd
import ast

# Load the merged dataset
# NOTE(review): no cell visible in this file writes "shl_assessments_cleaned.csv",
# and the columns used below (job_levels, is_remote, duration_minutes, ...) differ
# from the scraper outputs - confirm which step produces this file.
df = pd.read_csv("shl_assessments_cleaned.csv")

# Helper function to clean list-like string columns
def clean_list_column(series):
    """Normalise a column of list-like values into lists of lowercase strings.

    Each cell is converted independently:

    * a string holding a Python literal (``"['A', 'B']"``) is parsed with
      ``ast.literal_eval``;
    * a string that is not a literal (``"A, B, P"``) is split on commas;
    * a literal that evaluates to a single string is treated as a one-item
      list rather than being iterated character by character;
    * a value that is already an iterable container is used as-is;
    * anything else (NaN, numbers, None) yields ``[]``.

    Non-string items and blank strings are dropped; kept items are stripped
    and lower-cased.

    Args:
        series: pandas Series whose cells are list-like values or their
            string representations.

    Returns:
        A pandas Series of lists of lowercase strings, aligned with *series*.
    """
    def normalise(items):
        return [x.strip().lower() for x in items if isinstance(x, str) and x.strip()]

    def safe_parse(val):
        parsed = val
        if isinstance(val, str):
            try:
                parsed = ast.literal_eval(val)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a Python literal: treat it as comma-separated text.
                parsed = val.split(",")
            if isinstance(parsed, str):
                # A quoted scalar such as "'abc'" is one item, not three characters.
                parsed = [parsed]
        try:
            return normalise(parsed)
        except TypeError:
            # Non-iterable cell (NaN, int, None, ...): no items to keep.
            return []

    return series.apply(safe_parse)

# Parse the list-valued columns into real Python lists, one column at a time.
list_columns = ['job_levels', 'languages', 'test_type']
for col in list_columns:
    parsed_column = clean_list_column(df[col])
    df[col] = parsed_column

# Turn the TRUE/FALSE flag columns into Yes/No labels; any value other than
# TRUE/FALSE (compared case-insensitively) maps to NaN.
yes_no_labels = {'TRUE': 'Yes', 'FALSE': 'No'}
for flag_column, label_column in (
    ('is_remote', 'remote_support'),
    ('is_adaptive', 'adaptive_support'),
):
    upper_flags = df[flag_column].astype(str).str.upper()
    df[label_column] = upper_flags.map(yes_no_labels)

# Coerce durations to numbers (unparseable values become NaN) ...
df['duration_minutes'] = pd.to_numeric(df['duration_minutes'], errors='coerce')

# ... and keep only the rows that ended up with a duration.
has_duration = df['duration_minutes'].notna()
df = df[has_duration]

# Final cleaned dataframe: the raw flag columns are replaced by their labels.
cleaned_df = df.drop(columns=['is_remote', 'is_adaptive'])
cleaned_df.to_csv("cleaned_dataset.csv", index=False)

print("✅ Dataset cleaned successfully.")


In [None]:
import pandas as pd
import re

def _first_text(browser, css_selector):
    """Return the stripped text of the first match for *css_selector*, or None if absent."""
    try:
        return browser.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except NoSuchElementException:
        return None


def _all_texts(browser, css_selector):
    """Return the non-empty, stripped texts of every match for *css_selector*."""
    stripped = (elem.text.strip() for elem in browser.find_elements(By.CSS_SELECTOR, css_selector))
    return [text for text in stripped if text]


def _test_types(browser):
    """Return the single-letter test type codes from the 'Test Type' paragraph ([] if absent)."""
    try:
        # XPath 1.0 spells this as the function starts-with(); the method-call
        # form "normalize-space().startswith(...)" is not valid XPath.
        para = browser.find_element(
            By.XPATH, "//p[starts-with(normalize-space(), 'Test Type')]"
        )
    except NoSuchElementException:
        return []
    return re.findall(r'\b[A-Z]\b', para.text)


def _remote_testing(browser):
    """Return True when the 'Remote Testing' paragraph carries a '-yes' circle."""
    for para in browser.find_elements(By.CSS_SELECTOR, "p.product-catalogue__small-text"):
        if para.text.strip().startswith("Remote Testing"):
            return bool(para.find_elements(By.CSS_SELECTOR, "span.catalogue__circle.-yes"))
    return False


# STEP 1: Collect all assessment links with pagination
# NOTE: `options` is the ChromeOptions object created by an earlier cell.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

assessment_links = set()
data = []

try:
    driver.get("https://www.shl.com/solutions/products/product-catalog/")
    time.sleep(2)

    while True:
        # wait for the assessment links to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/product-catalog/view/']"))
        )
        time.sleep(2)

        # collect current page's assessment links
        for elem in driver.find_elements(By.CSS_SELECTOR, "a[href*='/product-catalog/view/']"):
            link = elem.get_attribute('href')
            if link:
                assessment_links.add(link)

        # find all "Next" buttons on page
        next_buttons = driver.find_elements(By.LINK_TEXT, "Next")

        # if there are at least 2, the second is the Individual Test Solutions pagination
        if len(next_buttons) < 2:
            print("No more pages to paginate. Finished collecting links.")
            break
        if next_buttons[1].get_attribute("aria-disabled") == "true":
            print("⛔ Reached last page; stopping.")
            break

        next_buttons[1].click()
        time.sleep(3)  # wait for new page to load

    print(f"Total unique assessments found: {len(assessment_links)}")

    # STEP 2: Visit each assessment page and extract details.
    # Sorted so the CSV row order is reproducible between runs.
    for link in sorted(assessment_links):
        driver.get(link)
        time.sleep(2)

        data.append({
            "Title": _first_text(driver, "h1.main-title"),
            "Description": _first_text(driver, "div.prose"),
            "Job Levels": _all_texts(driver, "div.job-levels span"),
            "Languages": _all_texts(driver, "ul.languages li"),
            "Duration": _first_text(
                driver, "p.product-catalogue__small-text span.catalogue__text"
            ),
            "TestTypes": ", ".join(_test_types(driver)),
            "RemoteTesting": _remote_testing(driver),
            "URL": link
        })
finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()

# STEP 3: Save to CSV
df = pd.DataFrame(data)
df.to_csv('shl_individual_assess.csv', index=False)
print("✅ Saved to shl_individual_assess.csv")


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

def _first_text(browser, css_selector):
    """Return the stripped text of the first match for *css_selector*, or None if absent."""
    # find_elements returns [] instead of raising, so a single lookup covers
    # both the "present" and the "absent" case.
    matches = browser.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text.strip() if matches else None


def _all_texts(browser, css_selector):
    """Return the non-empty, stripped texts of every match for *css_selector*."""
    stripped = (elem.text.strip() for elem in browser.find_elements(By.CSS_SELECTOR, css_selector))
    return [text for text in stripped if text]


def _test_types(browser):
    """Return the single-letter test type codes from the 'Test Type' paragraph ([] if absent)."""
    # XPath 1.0 spells this as the function starts-with(); the method-call
    # form "normalize-space().startswith(...)" is not valid XPath.
    matches = browser.find_elements(
        By.XPATH, "//p[starts-with(normalize-space(), 'Test Type')]"
    )
    return re.findall(r'\b[A-Z]\b', matches[0].text) if matches else []


def _remote_testing(browser):
    """Return True when the 'Remote Testing' paragraph carries a '-yes' circle."""
    for para in browser.find_elements(By.CSS_SELECTOR, "p.product-catalogue__small-text"):
        if para.text.strip().startswith("Remote Testing"):
            return bool(para.find_elements(By.CSS_SELECTOR, "span.catalogue__circle.-yes"))
    return False


# STEP 1: Collect all assessment links with pagination (second scraper)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

assessment_links = set()
data = []

try:
    driver.get("https://www.shl.com/solutions/products/product-catalog/")
    time.sleep(2)

    while True:
        # wait for the assessment links to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/product-catalog/view/']"))
        )
        time.sleep(2)

        # collect current page's assessment links
        for elem in driver.find_elements(By.CSS_SELECTOR, "a[href*='/product-catalog/view/']"):
            link = elem.get_attribute('href')
            if link:
                assessment_links.add(link)

        # find all "Next" buttons on page
        next_buttons = driver.find_elements(By.LINK_TEXT, "Next")

        # if there are at least 2, the second is the Individual Test Solutions pagination
        if len(next_buttons) < 2:
            print("No more pages to paginate. Finished collecting links.")
            break
        if next_buttons[1].get_attribute("aria-disabled") == "true":
            print("⛔ Reached last page; stopping.")
            break

        next_buttons[1].click()
        time.sleep(3)  # wait for new page to load

    print(f"Total unique assessments found: {len(assessment_links)}")

    # STEP 2: Visit each assessment page and extract details.
    # Sorted so the CSV row order is reproducible between runs.
    for link in sorted(assessment_links):
        driver.get(link)
        time.sleep(2)

        data.append({
            'Title': _first_text(driver, "h1.main-title"),
            'Description': _first_text(driver, "div.prose"),
            'Job Levels': _all_texts(driver, "div.job-levels span"),
            'Languages': _all_texts(driver, "ul.languages li"),
            'Assessment Length': _first_text(
                driver, "p.product-catalogue__small-text span.catalogue__text"
            ),
            'TestTypes': ", ".join(_test_types(driver)),
            'RemoteTesting': _remote_testing(driver),
            'Adaptive/IRT': None,  # no data in this loop
            'URL': link
        })
finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()

# STEP 3: Save to CSV
df = pd.DataFrame(data)
df.to_csv('shl_pre_pack.csv', index=False)
print("✅ Saved to shl_pre_pack.csv")


Combined Text Column Generation

In [None]:
import re

def create_combined_text_column(csv_path, output_path="assessments_with_combined_text.csv"):
    """Add a cleaned ``combined_text`` column (description + title) and save the result.

    The description and title columns are looked up by exact name first
    (``description`` / ``Title``) and then case-insensitively, so CSVs with
    either header casing are accepted. Missing values count as empty text and
    non-string values are converted to strings before joining.

    The joined text is cleaned by removing digits, removing bracketed segments
    (``[...]``, ``(...)``, ``{...}``) and replacing every remaining
    non-word, non-space character with a space. Rows whose cleaned text is
    empty are dropped.

    Args:
        csv_path: Path of the CSV file to read.
        output_path: Path the augmented CSV is written to.

    Returns:
        The DataFrame that was written, including ``combined_text``.

    Raises:
        KeyError: If no description or title column can be found.
    """
    df = pd.read_csv(csv_path)

    # Function to clean text by removing numbers, brackets, and special characters
    def clean_title(text):
        text = re.sub(r"\d+", "", text)  # remove numbers
        text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)  # remove bracketed text
        text = re.sub(r"[^\w\s]", " ", text)  # remove special characters
        return text.strip()

    def column_as_text(name):
        """Return column *name* (exact, else case-insensitive match) as strings."""
        if name in df.columns:
            found = name
        else:
            candidates = [c for c in df.columns if str(c).lower() == name.lower()]
            if not candidates:
                raise KeyError(name)
            found = candidates[0]
        # fillna first so missing cells become "" rather than the text "nan".
        return df[found].fillna("").astype(str)

    # Create combined text column
    df['combined_text'] = column_as_text('description') + " " + column_as_text('Title')
    df['combined_text'] = df['combined_text'].apply(clean_title)

    # After fillna() there are no NaNs left to drop; rows without usable text
    # show up as empty strings instead, so filter on those.
    df = df[df['combined_text'] != ""]

    df.to_csv(output_path, index=False)
    print(f"✅ Combined text and cleaned title saved to {output_path}")
    return df

# Example run: build the combined-text CSV from "metadata_cleaned.csv",
# writing to the function's default output path.
create_combined_text_column(csv_path="metadata_cleaned.csv")
