# Embedding Analysis

Research notebook for evaluating embedding quality and comparing LLM classifications against embedding nearest-neighbor baselines.

**Requires:**
- `data/embeddings/voyage_mcp_emb.npy` — Voyage-4-large MCP embeddings (computed from the 2026-01-22 MCP dataset)
- `data/embeddings/voyage_dwa_emb.npy` — Voyage-4-large DWA embeddings
- `data/embeddings/mpnet_mcp_emb.npy` — all-mpnet-base-v2 MCP embeddings (optional)
- `data/embeddings/mpnet_dwa_emb.npy` — all-mpnet-base-v2 DWA embeddings (optional)
- `data/mcp/raw/mcp_data_2026-01-22.csv` — cleaned MCP data
- `data/onet/onet_data.csv` — O*NET tasks
- `data/mcp/mcp_classification_teddy.csv` — manual ground-truth classifications
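- `data/mcp/gpt-4.1_v5.2_occ_gwa_iwa_dwa_task.csv` — GPT-4.1 v5.2 classifications
- `data/mcp/results/mcp_results_*.csv` — pipeline results (the human-validation section picks the last file by name)
- `data/onet/tasks_dwa_iwa_gwa.csv` — O*NET task → DWA → IWA → GWA hierarchy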

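The mpnet embeddings are not generated by the main pipeline. To regenerate them, run the snippet below, where `texts` is the list of strings to embed, in the same row order as the data loaded in this notebook. The snippet saves the MCP file; run it again on the unique DWA titles and save to `data/embeddings/mpnet_dwa_emb.npy`: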
```python
from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer('all-mpnet-base-v2')
embs = np.array(model.encode(texts, show_progress_bar=True), dtype=np.float32)
np.save('data/embeddings/mpnet_mcp_emb.npy', embs)
```

In [1]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Root of the data tree, resolved relative to the notebook's working directory.
DATA_DIR = Path("../data")

# ---------- Load MCP + O*NET data ----------
# The full Jan 2026 dataset used for the original classification run
mcp_df = pd.read_csv(DATA_DIR / "mcp/raw/mcp_data_2026-01-22.csv")
onet_df = pd.read_csv(DATA_DIR / "onet/onet_data.csv")

# Later cells index the embedding matrices by position in these two lists,
# so their order must not be changed after loading.
mcp_titles = mcp_df["title"].tolist()
# Unique, non-null DWA titles in first-occurrence order.
dwa_titles_unique = onet_df["dwa_title"].dropna().drop_duplicates().reset_index(drop=True).tolist()

print(f"MCPs loaded:         {len(mcp_df):,}")
print(f"Unique DWA titles:   {len(dwa_titles_unique):,}")

# ---------- Load Voyage embeddings ----------
# voyage_mcp_emb.npy = original full-dataset embeddings (8,957 MCPs x 1024-dim)
voyage_mcp_emb = np.load(DATA_DIR / "embeddings/voyage_mcp_emb.npy")
voyage_dwa_emb = np.load(DATA_DIR / "embeddings/voyage_dwa_emb.npy")
print(f"Voyage MCP emb:  {voyage_mcp_emb.shape}")
print(f"Voyage DWA emb:  {voyage_dwa_emb.shape}")

# Later cells look up embedding rows by position in mcp_titles /
# dwa_titles_unique, so a row-count mismatch would silently pair texts with
# the wrong vectors. Fail loudly here instead.
assert voyage_mcp_emb.shape[0] == len(mcp_titles), (
    f"Voyage MCP embeddings have {voyage_mcp_emb.shape[0]} rows "
    f"but {len(mcp_titles)} MCPs were loaded"
)
assert voyage_dwa_emb.shape[0] == len(dwa_titles_unique), (
    f"Voyage DWA embeddings have {voyage_dwa_emb.shape[0]} rows "
    f"but {len(dwa_titles_unique)} unique DWA titles were loaded"
)

# ---------- Load mpnet embeddings (optional) ----------
# Both files must exist; otherwise every mpnet section is skipped via
# the MPNET_AVAILABLE flag.
mpnet_mcp_path = DATA_DIR / "embeddings/mpnet_mcp_emb.npy"
mpnet_dwa_path = DATA_DIR / "embeddings/mpnet_dwa_emb.npy"
if mpnet_mcp_path.exists() and mpnet_dwa_path.exists():
    mpnet_mcp_emb = np.load(mpnet_mcp_path)
    mpnet_dwa_emb = np.load(mpnet_dwa_path)
    print(f"mpnet MCP emb:   {mpnet_mcp_emb.shape}")
    print(f"mpnet DWA emb:   {mpnet_dwa_emb.shape}")
    # Same positional-alignment requirement as for the Voyage matrices.
    assert mpnet_mcp_emb.shape[0] == len(mcp_titles), (
        f"mpnet MCP embeddings have {mpnet_mcp_emb.shape[0]} rows "
        f"but {len(mcp_titles)} MCPs were loaded"
    )
    assert mpnet_dwa_emb.shape[0] == len(dwa_titles_unique), (
        f"mpnet DWA embeddings have {mpnet_dwa_emb.shape[0]} rows "
        f"but {len(dwa_titles_unique)} unique DWA titles were loaded"
    )
    MPNET_AVAILABLE = True
else:
    print("mpnet embeddings not found — skipping mpnet sections.")
    MPNET_AVAILABLE = False

# ---------- L2 normalize ----------
def l2_normalize(X):
    """Scale every row of the 2-D array ``X`` to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing NaNs.
    """
    row_lengths = np.linalg.norm(X, axis=1, keepdims=True)
    safe_lengths = np.where(row_lengths == 0, 1, row_lengths)
    return X / safe_lengths

# Unit-length copies of the embeddings: with normalized rows, a plain dot
# product between two rows is their cosine similarity.
voyage_mcp_norm = l2_normalize(voyage_mcp_emb)
voyage_dwa_norm = l2_normalize(voyage_dwa_emb)
if MPNET_AVAILABLE:
    # Only defined when both mpnet files were found by the load step.
    mpnet_mcp_norm = l2_normalize(mpnet_mcp_emb)
    mpnet_dwa_norm = l2_normalize(mpnet_dwa_emb)

print("\nSetup complete.")


MCPs loaded:         8,957
Unique DWA titles:   2,083
Voyage MCP emb:  (8957, 1024)
Voyage DWA emb:  (2083, 1024)
mpnet MCP emb:   (8957, 768)
mpnet DWA emb:   (2083, 768)

Setup complete.


## Cosine Similarity Distribution Analysis

In [3]:
# ============================================================
# Cosine Similarity Distribution Analysis
# ============================================================

def print_stats(label, vals):
    """Print a fixed block of summary statistics for ``vals``.

    Emits mean, median, standard deviation, min, max and the 5th / 25th /
    75th / 95th percentiles, one per line, each formatted to 4 decimals.

    ``label`` is accepted for call-site readability but is not printed.
    """
    summary = [
        ("Mean:", np.mean(vals)),
        ("Median:", np.median(vals)),
        ("Std Dev:", np.std(vals)),
        ("Min:", np.min(vals)),
        ("Max:", np.max(vals)),
    ]
    for pct in (5, 25, 75, 95):
        summary.append((f"{pct}th pctile:", np.percentile(vals, pct)))

    # Every caption is left-padded to 13 characters so the numbers line up.
    for caption, value in summary:
        print(f"    {caption:<13}{value:.4f}")


def run_distribution_analysis(mcp_emb, dwa_emb, model_name, top_k=10, chunk_size=500):
    """Print MCP -> DWA cosine-similarity distribution statistics for one model.

    Both embedding matrices are L2-normalized, then for every MCP row the
    similarities to all DWA rows are computed (in chunks of ``chunk_size``
    MCP rows) and summarized as:

    * the top-1 similarity,
    * the mean of the ``top_k`` highest similarities,
    * the mean over all DWAs,
    * per-rank mean / median / std for ranks 1..``top_k``.

    Parameters
    ----------
    mcp_emb, dwa_emb : 2-D arrays
        Raw (un-normalized) embeddings, one row per item, same width.
    model_name : str
        Label used in the printed header.
    top_k : int, default 10
        Number of nearest DWAs to report. Clamped to the number of DWA rows,
        so a small DWA set no longer raises ``IndexError``.
    chunk_size : int, default 500
        Number of MCP rows multiplied against the DWA matrix at a time.

    Returns
    -------
    (mcp_norm, dwa_norm) : the two L2-normalized matrices.

    Raises
    ------
    ValueError
        If either matrix has no rows, or ``top_k`` / ``chunk_size`` < 1.
    """
    mcp_norm = l2_normalize(mcp_emb)
    dwa_norm = l2_normalize(dwa_emb)

    n_mcp = mcp_norm.shape[0]
    n_dwa = dwa_norm.shape[0]
    if n_mcp == 0 or n_dwa == 0:
        raise ValueError("need at least one MCP embedding and one DWA embedding")
    if top_k < 1 or chunk_size < 1:
        raise ValueError("top_k and chunk_size must both be >= 1")
    k_eff = min(top_k, n_dwa)

    print("=" * 70)
    print(f"COSINE SIMILARITY DISTRIBUTION ANALYSIS  --  {model_name}")
    print(f"  {n_mcp:,} MCP servers  |  {n_dwa:,} unique DWA titles")
    print("=" * 70)

    # Pre-allocate in the dtype of the similarity products so the statistics
    # are computed at the same precision as the similarities themselves.
    sim_dtype = np.result_type(mcp_norm, dwa_norm)
    top1_sims = np.empty(n_mcp, dtype=sim_dtype)
    topk_means = np.empty(n_mcp, dtype=sim_dtype)
    all_means = np.empty(n_mcp, dtype=sim_dtype)
    # One contiguous row per rank position (row 0 = closest DWA).
    rank_sims = np.empty((k_eff, n_mcp), dtype=sim_dtype)

    for start in range(0, n_mcp, chunk_size):
        stop = min(start + chunk_size, n_mcp)
        sims = mcp_norm[start:stop] @ dwa_norm.T

        # Sort each row ascending, reverse to descending, keep the k best.
        top_desc = np.sort(sims, axis=1)[:, ::-1][:, :k_eff]
        top1_sims[start:stop] = top_desc[:, 0]
        topk_means[start:stop] = top_desc.mean(axis=1)
        all_means[start:stop] = sims.mean(axis=1)
        rank_sims[:, start:stop] = top_desc.T

    print()
    print("--- A. MCP Embedding -> DWA Embeddings ---")
    print()
    print("  Top-1 (closest single DWA)")
    print_stats("top1", top1_sims)

    print()
    print(f"  Top-{k_eff} mean (avg of {k_eff} closest)")
    print_stats(f"top{k_eff}", topk_means)

    print()
    print("  All-DWA mean (baseline avg across all DWAs)")
    print_stats("all", all_means)

    print()
    print(f"  Similarity drop-off by rank position (1=closest, {k_eff}={k_eff}th closest):")
    print(f"    {'Rank':>8}   {'Mean':>6}   {'Median':>6}      {'Std':>5}")
    for k in range(k_eff):
        arr = rank_sims[k]
        print(f"    {k+1:>8}   {arr.mean():.4f}   {np.median(arr):.4f}   {arr.std():.4f}")

    print()
    return mcp_norm, dwa_norm


# Run the distribution report per model. The function is called for its
# printed output; the normalized matrices it returns are discarded here.
run_distribution_analysis(voyage_mcp_emb, voyage_dwa_emb, "Voyage-4-large")

if MPNET_AVAILABLE:
    run_distribution_analysis(mpnet_mcp_emb, mpnet_dwa_emb, "all-mpnet-base-v2")

COSINE SIMILARITY DISTRIBUTION ANALYSIS  --  Voyage-4-large
  8,957 MCP servers  |  2,083 unique DWA titles

--- A. MCP Embedding -> DWA Embeddings ---

  Top-1 (closest single DWA)
    Mean:        0.4382
    Median:      0.4325
    Std Dev:     0.0507
    Min:         0.2903
    Max:         0.6904
    5th pctile:  0.3667
    25th pctile: 0.4035
    75th pctile: 0.4659
    95th pctile: 0.5326

  Top-10 mean (avg of 10 closest)
    Mean:        0.4009
    Median:      0.3984
    Std Dev:     0.0412
    Min:         0.2739
    Max:         0.6454
    5th pctile:  0.3392
    25th pctile: 0.3729
    75th pctile: 0.4247
    95th pctile: 0.4713

  All-DWA mean (baseline avg across all DWAs)
    Mean:        0.2386
    Median:      0.2379
    Std Dev:     0.0344
    Min:         0.1223
    Max:         0.4287
    5th pctile:  0.1843
    25th pctile: 0.2171
    75th pctile: 0.2590
    95th pctile: 0.2911

  Similarity drop-off by rank position (1=closest, 10=10th closest):
        Rank     M

## Example Similarity Pairs (Eyeball Check)

In [4]:
# ============================================================
# Example pairs at different similarity thresholds
# For each model: show 2 examples each for DWA<->DWA,
# MCP<->MCP, and MCP<->DWA at HIGH and MEDIUM similarity
# ============================================================

def _print_high_and_median_pairs(sims, eligible, median_pool,
                                 row_titles, col_titles, row_tag, col_tag):
    """Print the highest-similarity pair and the pair closest to the median.

    ``sims`` is a similarity matrix, ``eligible`` a same-shape boolean mask of
    cells that may be reported, and ``median_pool`` the values the median is
    taken over. Ineligible cells are masked with -inf / +inf so they can never
    be chosen, whatever the sign or magnitude of the similarities.
    """
    if not eligible.any():
        print("\n  (not enough items to form a pair)")
        return

    hi = np.unravel_index(np.argmax(np.where(eligible, sims, -np.inf)), sims.shape)
    print(f"\n  HIGH similarity ({sims[hi]:.4f}):")
    print(f"    {row_tag}: {row_titles[hi[0]]}")
    print(f"    {col_tag}: {col_titles[hi[1]]}")

    median_sim = np.median(median_pool)
    gap = np.where(eligible, np.abs(sims - median_sim), np.inf)
    mid = np.unravel_index(np.argmin(gap), sims.shape)
    print(f"\n  MEDIUM similarity ({sims[mid]:.4f}, median={median_sim:.4f}):")
    print(f"    {row_tag}: {row_titles[mid[0]]}")
    print(f"    {col_tag}: {col_titles[mid[1]]}")


def show_example_pairs(mcp_norm, dwa_norm, mcp_titles_list, dwa_titles_list, model_name):
    """Print example pairs at HIGH and MEDIUM (median) cosine similarity.

    Three comparisons are reported: DWA <-> DWA, MCP <-> MCP and MCP <-> DWA.
    The MCP side uses a fixed-seed random sample of at most 500 MCPs, so
    repeated runs show the same examples.

    Parameters
    ----------
    mcp_norm, dwa_norm : 2-D float arrays
        L2-normalized embeddings (dot product == cosine similarity).
    mcp_titles_list, dwa_titles_list : list of str
        Titles aligned row-for-row with the two matrices.
    model_name : str
        Label used in the printed header.

    Self-pairs (the diagonal) are excluded with an explicit mask rather than
    by zeroing the diagonal, so an item is never reported as its own nearest
    neighbour even when every other similarity is <= 0.
    """
    print("=" * 70)
    print(f"EXAMPLE SIMILARITY PAIRS  --  {model_name}")
    print("=" * 70)

    # --- DWA <-> DWA ---
    print("\n--- DWA <-> DWA ---")
    dwa_sims = dwa_norm @ dwa_norm.T
    _print_high_and_median_pairs(
        dwa_sims,
        ~np.eye(dwa_sims.shape[0], dtype=bool),
        # Median over each unordered pair once (strict upper triangle).
        dwa_sims[np.triu_indices_from(dwa_sims, k=1)],
        dwa_titles_list, dwa_titles_list, "A", "B",
    )

    # --- MCP <-> MCP ---
    print("\n--- MCP <-> MCP ---")
    sample_n = min(500, len(mcp_titles_list))
    rng = np.random.default_rng(42)
    sample_idx = rng.choice(len(mcp_titles_list), sample_n, replace=False)
    mcp_sample = mcp_norm[sample_idx]
    mcp_sample_titles = [mcp_titles_list[i] for i in sample_idx]

    mcp_sims = mcp_sample @ mcp_sample.T
    _print_high_and_median_pairs(
        mcp_sims,
        ~np.eye(mcp_sims.shape[0], dtype=bool),
        mcp_sims[np.triu_indices_from(mcp_sims, k=1)],
        mcp_sample_titles, mcp_sample_titles, "A", "B",
    )

    # --- MCP <-> DWA ---
    print("\n--- MCP <-> DWA ---")
    cross_sims = mcp_sample @ dwa_norm.T
    _print_high_and_median_pairs(
        cross_sims,
        np.ones(cross_sims.shape, dtype=bool),  # every cell is a valid pair
        cross_sims.ravel(),
        mcp_sample_titles, dwa_titles_list, "MCP", "DWA",
    )

    print()


# Eyeball check per model, using the normalized matrices built in the setup cell.
show_example_pairs(voyage_mcp_norm, voyage_dwa_norm, mcp_titles, dwa_titles_unique, "Voyage-4-large")

if MPNET_AVAILABLE:
    show_example_pairs(mpnet_mcp_norm, mpnet_dwa_norm, mcp_titles, dwa_titles_unique, "all-mpnet-base-v2")

EXAMPLE SIMILARITY PAIRS  --  Voyage-4-large

--- DWA <-> DWA ---

  HIGH similarity (0.9945):
    A: Operate audiovisual equipment.
    B: Operate audio-visual equipment.

  MEDIUM similarity (0.5332, median=0.5332):
    A: Create new recipes or food presentations.
    B: Establish business management methods.

--- MCP <-> MCP ---

  HIGH similarity (0.9911):
    A: test-server MCP server
    B: mcp-client-and-server MCP server

  MEDIUM similarity (0.5484, median=0.5484):
    A: awesome-mcp-ranking-reviews
    B: Octopus Deploy Mcp Server

--- MCP <-> DWA ---

  HIGH similarity (0.6205):
    MCP: Zomato MCP
    DWA: Take customer orders.

  MEDIUM similarity (0.2350, median=0.2350):
    MCP: Vibe Coding Buddy
    DWA: Develop business relationships.

EXAMPLE SIMILARITY PAIRS  --  all-mpnet-base-v2

--- DWA <-> DWA ---

  HIGH similarity (0.9796):
    A: Clean facilities or equipment.
    B: Clean equipment or facilities.

  MEDIUM similarity (0.2560, median=0.2560):
    A: Direct act

## Classification Comparison: Manual + GPT-4.1 v5.2 vs Embedding Nearest Neighbors

For each (MCP, selected DWA) pair in the ground-truth classifications, this section computes the cosine similarity rank of that DWA relative to all 2,083 unique DWAs. This validates how well the embedding retrieval step captures the correct DWAs.

In [5]:
# ============================================================
# Compare Teddy & GPT-4.1 DWA selections against embedding
# nearest neighbors.
# ============================================================

# Manual (Teddy) and GPT-4.1 classification tables. compute_dwa_ranks below
# reads the "title", "occ_relevant" and DWA columns, plus "url" when present.
teddy_class = pd.read_csv(DATA_DIR / "mcp/mcp_classification_teddy.csv")
gpt_class = pd.read_csv(DATA_DIR / "mcp/gpt-4.1_v5.2_occ_gwa_iwa_dwa_task.csv")

# These counts are row counts of each file.
print(f"Teddy classifications loaded: {len(teddy_class)} MCPs")
print(f"GPT-4.1 classifications loaded: {len(gpt_class)} MCPs")

# --- Build lookup tables ---
def normalize_dwa_text(text):
    """Return a canonical key for a DWA title, used for exact-match lookups.

    Lowercases, drops trailing periods, removes commas and semicolons, and
    collapses runs of whitespace to a single space. Surrounding whitespace is
    stripped again at the end, so inputs such as ``"Process data ."`` or
    ``"Process data ;"`` do not leave a trailing space in the key.

    ``text`` must be a ``str``.
    """
    text = text.strip().lower()
    text = text.rstrip(".")
    text = re.sub(r"[,;]", "", text)
    # Removing the period / punctuation above can expose whitespace at the
    # edges, so strip once more after collapsing.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalized DWA text -> row index into dwa_titles_unique (and thus the DWA embeddings).
dwa_lookup = {normalize_dwa_text(d): i for i, d in enumerate(dwa_titles_unique)}
# MCP title / URL -> row index into mcp_df (and thus the MCP embeddings).
# NOTE: in a dict comprehension a repeated key keeps the LAST index seen, so
# MCPs that share a title collapse to one entry — compare the "MCP lookup"
# count printed below with the number of MCPs loaded.
mcp_title_to_idx = {str(t).strip(): i for i, t in enumerate(mcp_titles)}
mcp_url_to_idx = {str(u).strip(): i for i, u in enumerate(mcp_df["url"].values)}

print(f"DWA lookup:  {len(dwa_lookup):,} entries")
print(f"MCP lookup:  {len(mcp_title_to_idx):,} entries")


def compute_dwa_ranks(class_df, source_name, mcp_norm, dwa_norm, dwa_col="dwa"):
    """Rank each classifier-selected DWA among all DWAs by embedding similarity.

    For every row of ``class_df`` that has a non-empty ``dwa_col`` and
    ``occ_relevant == "yes"``, the MCP is located in the embedding matrix and
    each semicolon-separated DWA in ``dwa_col`` is looked up in ``dwa_lookup``.
    For each matched (MCP, DWA) pair the cosine similarity and its 1-based
    rank among all DWAs are recorded.

    MCP matching tries the row's URL first and falls back to the title.
    Titles are not unique in the MCP data (``mcp_title_to_idx`` keeps only the
    last row for a repeated title), so a title-first match could silently pick
    the embedding of a different server with the same name.

    Parameters
    ----------
    class_df : DataFrame with "title", "occ_relevant", ``dwa_col`` and
        optionally "url" columns.
    source_name : str
        Label stored in the "source" column and used in the printed summary.
    mcp_norm, dwa_norm : 2-D arrays
        L2-normalized embeddings aligned with ``mcp_titles`` /
        ``dwa_titles_unique``.
    dwa_col : str, default "dwa"
        Column holding the semicolon-separated DWA selections.

    Returns
    -------
    DataFrame with one row per matched (MCP, DWA) pair; empty if none matched.
    Match / miss counts and up to five unmatched DWA examples are printed.
    """
    results = []
    n_matched_mcps = 0
    n_unmatched_mcps = 0
    n_matched_dwas = 0
    n_unmatched_dwas = 0
    unmatched_examples = []
    total_dwas = len(dwa_titles_unique)

    valid = class_df[
        class_df[dwa_col].notna()
        & (class_df[dwa_col].astype(str).str.strip() != "")
        & (class_df["occ_relevant"].astype(str).str.strip().str.lower() == "yes")
    ].copy()

    for _, row in valid.iterrows():
        title = str(row["title"]).strip()
        url = str(row.get("url", "")).strip()

        # Prefer the URL; skip it when absent (missing column -> "",
        # missing value -> "nan" after str()).
        mcp_idx = None
        if url and url.lower() != "nan":
            mcp_idx = mcp_url_to_idx.get(url)
        if mcp_idx is None:
            mcp_idx = mcp_title_to_idx.get(title)
        if mcp_idx is None:
            n_unmatched_mcps += 1
            continue

        mcp_vec = mcp_norm[mcp_idx : mcp_idx + 1]
        n_matched_mcps += 1

        all_sims = (mcp_vec @ dwa_norm.T).flatten()
        sorted_indices = np.argsort(-all_sims)

        # rank_lookup[j] = 1-based rank of DWA j for this MCP (1 = most similar).
        rank_lookup = np.empty(len(all_sims), dtype=int)
        rank_lookup[sorted_indices] = np.arange(1, len(all_sims) + 1)

        top1_sim = float(all_sims[sorted_indices[0]])
        top1_dwa_text = dwa_titles_unique[sorted_indices[0]]

        dwa_strings = [t.strip() for t in str(row[dwa_col]).split(";") if t.strip()]

        for dwa_str in dwa_strings:
            norm_key = normalize_dwa_text(dwa_str)
            if norm_key in dwa_lookup:
                dwa_idx = dwa_lookup[norm_key]
                sim = float(all_sims[dwa_idx])
                rank = int(rank_lookup[dwa_idx])
                n_matched_dwas += 1
                results.append({
                    "source": source_name,
                    "mcp_title": title,
                    "selected_dwa": dwa_str,
                    "cosine_similarity": round(sim, 4),
                    "rank": rank,
                    "total_dwas": total_dwas,
                    "percentile": round((1 - rank / total_dwas) * 100, 2),
                    "top1_similarity": round(top1_sim, 4),
                    "top1_dwa": top1_dwa_text,
                    "sim_gap_from_top1": round(top1_sim - sim, 4),
                })
            else:
                n_unmatched_dwas += 1
                unmatched_examples.append((title, dwa_str))

    print(f"\n  {source_name}")
    print(f"  MCPs matched: {n_matched_mcps}  |  not found: {n_unmatched_mcps}")
    print(f"  DWAs matched: {n_matched_dwas}  |  not matched: {n_unmatched_dwas}")
    if unmatched_examples:
        print("  Sample unmatched DWAs:")
        for title, dwa in unmatched_examples[:5]:
            print(f"    [{title}] {dwa[:80]}")

    return pd.DataFrame(results)


def print_rank_summary(label, df):
    """Print similarity and rank statistics for one table of (MCP, DWA) pairs.

    ``df`` needs the columns "mcp_title", "cosine_similarity", "rank" and
    "total_dwas". An empty frame prints a one-line notice and nothing else.
    """
    if df.empty:
        print(f"{label}: No matched DWAs to analyze.")
        return

    sims = df['cosine_similarity']
    ranks = df['rank']
    rule = "=" * 60

    print()
    print(rule)
    print(f"  {label} -- Summary Statistics")
    print(rule)
    print(f"  Total (MCP, DWA) pairs: {len(df):,}")
    print(f"  Unique MCPs:            {df['mcp_title'].nunique()}")
    print()
    print("  Cosine Similarity of Selected DWAs:")
    print(f"    Mean:    {sims.mean():.4f}")
    print(f"    Median:  {sims.median():.4f}")
    print()
    print(f"  Rank among {df['total_dwas'].iloc[0]:,} unique DWAs:")
    print(f"    Mean rank:   {ranks.mean():.0f}")
    print(f"    Median rank: {ranks.median():.0f}")
    # Share of selected DWAs that fall within the top-N nearest neighbours.
    for cutoff in (10, 50, 80, 100):
        share = (ranks <= cutoff).mean() * 100
        heading = f"% in top {cutoff}:"
        print(f"    {heading:<15}{share:.1f}%")


# (label, normalized MCP matrix, normalized DWA matrix) for each available model.
models_to_run = [('Voyage-4-large', voyage_mcp_norm, voyage_dwa_norm)]
if MPNET_AVAILABLE:
    models_to_run.append(('all-mpnet-base-v2', mpnet_mcp_norm, mpnet_dwa_norm))

# NOTE: teddy_ranks / gpt_ranks are rebound on every iteration, so after the
# loop they hold the results of the LAST model in models_to_run only.
for model_label, mcp_n, dwa_n in models_to_run:
    print('\n' + '#' * 70)
    print(f'#  {model_label}')
    print('#' * 70)

    teddy_ranks = compute_dwa_ranks(teddy_class, f'Teddy (manual) [{model_label}]', mcp_n, dwa_n)
    gpt_ranks = compute_dwa_ranks(gpt_class, f'GPT-4.1 (v5.2) [{model_label}]', mcp_n, dwa_n)

    print_rank_summary(f'Teddy (manual) [{model_label}]', teddy_ranks)
    print_rank_summary(f'GPT-4.1 (v5.2) [{model_label}]', gpt_ranks)


Teddy classifications loaded: 30 MCPs
GPT-4.1 classifications loaded: 30 MCPs
DWA lookup:  2,083 entries
MCP lookup:  7,458 entries

######################################################################
#  Voyage-4-large
######################################################################

  Teddy (manual) [Voyage-4-large]
  MCPs matched: 21  |  not found: 4
  DWAs matched: 43  |  not matched: 0

  GPT-4.1 (v5.2) [Voyage-4-large]
  MCPs matched: 24  |  not found: 4
  DWAs matched: 101  |  not matched: 4
  Sample unmatched DWAs:
    [SumoLogic MCP Server] Prepare reports of operational or procedural activities.
    [mxHERO Mail2Cloud MCP] Gather information from physical or electronic sources
    [mcp-fastify-server] Process information or data.
    [mcp_py_exam] Document technical designs, procedures, or activities.

  Teddy (manual) [Voyage-4-large] -- Summary Statistics
  Total (MCP, DWA) pairs: 43
  Unique MCPs:            21

  Cosine Similarity of Selected DWAs:
    Mean:    0.

## Human Validation: Pipeline vs Manual Classification

Compares the automated pipeline results (GPT-4.1 + Voyage-4-large, `mcp_results_*.csv`) against
Teddy's manual classifications (`mcp_classification_teddy.csv`) on overlapping MCP servers.

**Teddy's selected tasks are treated as having an implicit rating of 5** (fully automatable / relevant).

Analyses:
1. **DWA overlap** — Are Teddy’s selected DWAs recovered in the pipeline’s selected / used DWA sets?
2. **Task context coverage** — Do Teddy-assigned tasks appear in the pipeline’s task context?
3. **Rating agreement** — How does the pipeline rate Teddy’s tasks vs Teddy’s implicit 5?
4. **GWA / IWA alignment** — Do the pipeline DWAs map to the same broader work-activity categories?

In [6]:
# ============================================================
# Load pipeline results + O*NET hierarchy; find overlapping MCPs
# ============================================================
import glob as _glob

# Auto-detect latest mcp_results file.
# sorted() orders the paths as strings, so [-1] is the lexicographically last
# file name.
_results_files = sorted(_glob.glob(str(DATA_DIR / "mcp/results/mcp_results_*.csv")))
assert _results_files, "No mcp_results_*.csv found in data/mcp/results/"
mcp_results_path = _results_files[-1]
mcp_results = pd.read_csv(mcp_results_path)
print(f"Pipeline results: {len(mcp_results):,} MCPs  ({Path(mcp_results_path).name})")

# Load DWA -> IWA -> GWA hierarchy
tasks_hierarchy = pd.read_csv(DATA_DIR / "onet/tasks_dwa_iwa_gwa.csv")
print(f"Tasks hierarchy:  {len(tasks_hierarchy):,} rows  "
      f"({tasks_hierarchy['dwa_title'].nunique():,} unique DWAs)")

# teddy_class was loaded in the classification-comparison cell above; keep
# only the rows Teddy marked as occupationally relevant.
teddy_occ = teddy_class[
    teddy_class["occ_relevant"].astype(str).str.strip().str.lower() == "yes"
].copy()
print(f"Teddy labeled MCPs (occ_relevant=yes): {len(teddy_occ)}  "
      f"({teddy_occ['title'].nunique()} unique MCPs)")

# --- Text normalization helpers ---
# Alias for normalize_dwa_text, defined in the classification-comparison cell above.
_norm = normalize_dwa_text


def norm_task(t):
    """Normalize O*NET task text for matching.

    Converts ``t`` to ``str``, trims surrounding whitespace, lowercases, and
    then removes any trailing periods.
    """
    lowered = str(t).strip().lower()
    return lowered.rstrip(".")


def split_cell(val, sep=";"):
    """Split a delimited cell into its trimmed, non-empty parts.

    Returns ``[]`` for a missing value, a blank string, or the literal text
    "nan" (any case). Individual parts equal to "nan" are dropped as well.
    """
    if pd.isna(val):
        return []
    whole = str(val).strip()
    if not whole or whole.lower() == "nan":
        return []

    parts = []
    for chunk in str(val).split(sep):
        chunk = chunk.strip()
        if chunk and chunk.lower() != "nan":
            parts.append(chunk)
    return parts


def parse_sc_set(val, normalizer=None, sep=";"):
    """Turn a delimited cell into a set of normalized tokens.

    The cell is split with ``split_cell`` and every part is passed through
    ``normalizer``; when no normalizer is given, ``_norm`` is used.
    """
    if not normalizer:
        normalizer = _norm
    return set(map(normalizer, split_cell(val, sep)))


# --- Build DWA title -> {iwa, gwa} lookup (first occurrence wins) ---
# Normalized DWA title -> {"iwa": ..., "gwa": ...}. The hierarchy file has
# many rows per DWA (one per task); only the first row seen for each DWA is kept.
dwa_to_hier = {}
for _, _r in tasks_hierarchy.iterrows():
    _k = _norm(_r["dwa_title"])
    if _k not in dwa_to_hier:
        dwa_to_hier[_k] = {
            "iwa": str(_r["iwa_title"]).strip(),
            "gwa": str(_r["gwa_title"]).strip(),
        }
print(f"DWA->IWA/GWA lookup: {len(dwa_to_hier):,} unique DWAs")

# --- Find overlapping MCPs (case-insensitive title match) ---
# Lower-cased title -> title as written, for each side. Titles that differ
# only by case or surrounding whitespace collapse to a single key.
_teddy_lower = {str(t).strip().lower(): str(t).strip() for t in teddy_occ["title"]}
_pipe_lower  = {str(t).strip().lower(): str(t).strip() for t in mcp_results["title"]}
# Lower-cased titles present in both tables; the validation cells iterate over this set.
_overlap     = set(_teddy_lower.keys()) & set(_pipe_lower.keys())

print(f"\nOverlapping MCPs (in both Teddy labels and pipeline results): {len(_overlap)}")
for _tl in sorted(_overlap):
    # Show the pipeline's spelling of the title.
    print(f"  {_pipe_lower[_tl]}")

Pipeline results: 10,140 MCPs  (mcp_results_2026-02-18.csv)
Tasks hierarchy:  23,851 rows  (2,083 unique DWAs)
Teddy labeled MCPs (occ_relevant=yes): 28  (28 unique MCPs)
DWA->IWA/GWA lookup: 2,083 unique DWAs

Overlapping MCPs (in both Teddy labels and pipeline results): 26
  Alist MCP Server
  Amadeus MCP Server
  Claude MCP: Enhance Your Experience with Claude.ai
  comfy-ui-mcp-server MCP server
  CyberChef API MCP Server
  DART-mcp-server
  DiceDB MCP
  Filament MCP Server - Laravel Loop
  Gitee MCP Server
  Hubble AI for Solana
  iRacing
  MCP Chat Adapter
  MCP Graph API Integration
  MCP Lab
  mcp-fastify-server
  mcp-server-rubygems
  MCP-testing: OpenWebUI + mcpo Integration
  mcp_py_exam
  mentor-mcp-server
  mxHERO Mail2Cloud MCP
  Self-hosted AI starter kit (by the n8n team)
  SQL Server MCP Server
  Stability AI MCP Server
  SumoLogic MCP Server
  Vapi MCP for Cursor
  Web Content MCP


In [7]:
# ============================================================
# Section 1: DWA Overlap
# Pipeline dwas_selected / dwas_used_for_tasks vs Teddy DWAs
# ============================================================

def _rj(a, b):
    """Return (recall of ``a`` within ``b``, Jaccard of ``a`` and ``b``).

    Both values are rounded to 3 decimals. Returns ``(None, None)`` when
    ``a`` is empty, since recall is undefined without reference items.
    """
    if not a:
        return None, None
    inter = a & b
    return round(len(inter) / len(a), 3), round(len(inter) / len(a | b), 3)


# One summary row per overlapping MCP: Teddy's DWAs (the reference set)
# versus the pipeline's selected / used DWA sets.
dwa_rows = []
for tl in sorted(_overlap):
    # First row with a matching lower-cased title on each side.
    t_row = teddy_occ[teddy_occ["title"].str.strip().str.lower() == tl].iloc[0]
    p_row = mcp_results[mcp_results["title"].str.strip().str.lower() == tl].iloc[0]

    teddy_dwas = parse_sc_set(t_row["dwa"])
    pipe_sel   = parse_sc_set(p_row.get("dwas_selected"))
    pipe_used  = parse_sc_set(p_row.get("dwas_used_for_tasks"))

    r_sel,  _ = _rj(teddy_dwas, pipe_sel)
    r_used, j_used = _rj(teddy_dwas, pipe_used)
    missing   = teddy_dwas - pipe_used

    dwa_rows.append({
        "mcp_title":         _pipe_lower[tl],
        "n_teddy_dwas":      len(teddy_dwas),
        "n_pipe_selected":   len(pipe_sel),
        "n_pipe_used":       len(pipe_used),
        "recall_selected":   r_sel,
        "recall_used":       r_used,
        "jaccard_used":      j_used,
        "missing_from_used": "; ".join(sorted(missing)) if missing else "(all found)",
    })

dwa_df = pd.DataFrame(dwa_rows)

print("DWA OVERLAP \u2014 Teddy Selections vs Pipeline")
print("=" * 70)
print(dwa_df[["mcp_title","n_teddy_dwas","n_pipe_selected","n_pipe_used",
              "recall_selected","recall_used","jaccard_used"]].to_string(index=False))

# Aggregate only over MCPs where Teddy selected at least one DWA; the recall /
# Jaccard values are None for the others.
has_dwas = dwa_df["n_teddy_dwas"] > 0
if has_dwas.any():
    print()
    print(f"  Mean recall vs dwas_selected      : "
          f"{dwa_df.loc[has_dwas,'recall_selected'].mean():.1%}")
    print(f"  Mean recall vs dwas_used_for_tasks: "
          f"{dwa_df.loc[has_dwas,'recall_used'].mean():.1%}")
    print(f"  Mean Jaccard (used set)           : "
          f"{dwa_df.loc[has_dwas,'jaccard_used'].mean():.3f}")
    print()
    print("  Teddy DWAs NOT found in pipeline's dwas_used_for_tasks:")
    any_missing = False
    for _, r in dwa_df[has_dwas].iterrows():
        if r["missing_from_used"] != "(all found)":
            any_missing = True
            print(f"    [{r['mcp_title']}]")
            # The missing DWAs were joined with "; " when the rows were built;
            # split them back apart for one-per-line display.
            for m in r["missing_from_used"].split(";"):
                print(f"      - {m.strip()}")
    if not any_missing:
        print("    (none \u2014 pipeline recovered all Teddy DWAs)")

DWA OVERLAP — Teddy Selections vs Pipeline
                                         mcp_title  n_teddy_dwas  n_pipe_selected  n_pipe_used  recall_selected  recall_used  jaccard_used
                                  Alist MCP Server             0               15           13              NaN          NaN           NaN
                                Amadeus MCP Server             1                7            7            0.000        0.000         0.000
Claude MCP: Enhance Your Experience with Claude.ai             1               13            9            0.000        0.000         0.000
                    comfy-ui-mcp-server MCP server             1               15           10            1.000        0.000         0.000
                          CyberChef API MCP Server             1               15           11            0.000        0.000         0.000
                                   DART-mcp-server             2               15            8            0.000        0.00

In [8]:
# ============================================================
# Section 2: Task Context Coverage & Rating Agreement
# Teddy's tasks carry an implicit rating of 5.
# ============================================================

def parse_task_ratings(s):
    """Parse 'task text (occupation): N; ...' into {normalized task text: rating}.

    The string is cut on '; ' and each piece must match, in full,
    ``<task> (<occupation>): <1-5>``. Pieces that do not match are skipped.
    The occupation is matched but not stored. Missing or blank input gives
    an empty dict.
    """
    if pd.isna(s) or not str(s).strip():
        return {}

    entry_re = re.compile(r'^(.*?)\s+\(([^()]+)\):\s*([1-5])\s*$')
    ratings = {}
    for raw_entry in str(s).strip().split('; '):
        hit = entry_re.match(raw_entry.strip())
        if hit is None:
            continue
        task_text, rating_text = hit.group(1), hit.group(3)
        ratings[norm_task(task_text)] = int(rating_text)
    return ratings


# One row per (overlapping MCP, Teddy-assigned task): was the task rated by
# the pipeline, and if so how far is that rating from Teddy's implicit 5?
task_rows = []
for tl in sorted(_overlap):
    # First row with a matching lower-cased title on each side.
    t_row = teddy_occ[teddy_occ["title"].str.strip().str.lower() == tl].iloc[0]
    p_row = mcp_results[mcp_results["title"].str.strip().str.lower() == tl].iloc[0]
    orig  = _pipe_lower[tl]

    pipe_task_map = parse_task_ratings(p_row.get("task_ratings"))
    teddy_tasks   = split_cell(t_row["task"])   # semicolon-separated within cell

    for teddy_task in teddy_tasks:
        # Exact match on the normalized task text.
        tnorm       = norm_task(teddy_task)
        in_ctx      = tnorm in pipe_task_map
        pipe_rating = pipe_task_map.get(tnorm)
        task_rows.append({
            "mcp_title":       orig,
            "teddy_task":      teddy_task,
            "in_context":      in_ctx,
            "pipeline_rating": pipe_rating,
            "teddy_rating":    5,
            "abs_error":       abs(5 - pipe_rating) if pipe_rating is not None else None,
        })

task_df  = pd.DataFrame(task_rows)
total_t  = len(task_df)
in_ctx_n = int(task_df["in_context"].sum()) if total_t > 0 else 0
# Rows whose task was found among the pipeline's rated tasks.
matched  = task_df[task_df["in_context"]] if total_t > 0 else task_df.iloc[0:0]

print("TASK CONTEXT COVERAGE & RATING AGREEMENT")
print("=" * 70)
if total_t == 0:
    print("  No tasks assigned in overlapping Teddy rows.")
else:
    print(f"Teddy-assigned tasks total:          {total_t}")
    print(f"Found in pipeline task context:      {in_ctx_n} ({in_ctx_n / total_t:.1%})")

    if not matched.empty:
        print()
        print("Pipeline ratings for matched tasks (Teddy implicit = 5):")
        vc = matched["pipeline_rating"].value_counts().sort_index()
        for rating, count in vc.items():
            bar = "\u2588" * int(count)
            # The column is float when some tasks are unmatched (missing
            # ratings); cast so the label reads "Rating 4", not "Rating 4.0".
            print(f"  Rating {int(rating)}: {bar}  ({count})")
        print()
        print(f"  Mean pipeline rating:           {matched['pipeline_rating'].mean():.2f}")
        print(f"  Median pipeline rating:         {matched['pipeline_rating'].median():.1f}")
        print(f"  Mean absolute error vs 5:       {matched['abs_error'].mean():.2f}")
        print(f"  Rated >= 4 (pipeline agrees):   "
              f"{(matched['pipeline_rating'] >= 4).sum()} / {len(matched)} "
              f"({(matched['pipeline_rating'] >= 4).mean():.1%})")
        # Trailing space added: the label previously ran straight into the count.
        print(f"  Rated <= 2 (pipeline disagrees): "
              f"{(matched['pipeline_rating'] <= 2).sum()} / {len(matched)} "
              f"({(matched['pipeline_rating'] <= 2).mean():.1%})")

    missing_tasks = task_df[~task_df["in_context"]]
    if not missing_tasks.empty:
        print()
        print(f"Tasks NOT found in pipeline context ({len(missing_tasks)}):")
        for _, r in missing_tasks.iterrows():
            # Task text is truncated to 100 characters for display only.
            print(f"  [{r['mcp_title']}]  {r['teddy_task'][:100]}")

    print()
    print(task_df[["mcp_title","teddy_task","in_context",
                   "pipeline_rating","abs_error"]].to_string(index=False))


TASK CONTEXT COVERAGE & RATING AGREEMENT
Teddy-assigned tasks total:          71
Found in pipeline task context:      45 (63.4%)

Pipeline ratings for matched tasks (Teddy implicit = 5):
  Rating 1.0: ██  (2)
  Rating 2.0: ██████████  (10)
  Rating 3.0: █████████  (9)
  Rating 4.0: ██████████████████  (18)
  Rating 5.0: ██████  (6)

  Mean pipeline rating:           3.36
  Median pipeline rating:         4.0
  Mean absolute error vs 5:       1.64
  Rated >= 4 (pipeline agrees):   24 / 45 (53.3%)
  Rated <= 2 (pipeline disagrees):12 / 45 (26.7%)

Tasks NOT found in pipeline context (26):
  [Amadeus MCP Server]  Compile information about flights from flight plans, pilot reports, radar, or observations.
  [Claude MCP: Enhance Your Experience with Claude.ai]  Configure servers to meet functional specifications.
  [Claude MCP: Enhance Your Experience with Claude.ai]  Coordinate and link the computer systems within an organization to increase compatibility so that in
  [comfy-ui-mcp-server M

In [9]:
# ============================================================
# Section 3: GWA / IWA Alignment
# Map pipeline DWAs -> GWA/IWA via tasks_dwa_iwa_gwa.csv,
# then compare against Teddy's GWA/IWA selections.
# ============================================================

def _rj_sets(a, b):
    """Recall and Jaccard for two sets; returns (recall, jaccard) or (None, None)."""
    if not a:
        return None, None
    inter = a & b
    return round(len(inter) / len(a), 3), round(len(inter) / len(a | b), 3) if (a | b) else None


# Normalised (stripped, lower-cased) titles are computed once up front: every
# per-MCP lookup below compares against them and they do not change per MCP.
_hier_teddy_titles = teddy_occ["title"].str.strip().str.lower()
_hier_pipe_titles = mcp_results["title"].str.strip().str.lower()

# Explicit column order so hier_df keeps its schema even when _overlap is
# empty (a column-less frame would make the column selections below fail).
_HIER_COLUMNS = [
    "mcp_title",
    "teddy_gwas", "pipe_n_gwas", "gwa_recall", "gwa_jaccard", "gwa_overlap",
    "teddy_iwas", "pipe_n_iwas", "iwa_recall", "iwa_jaccard", "iwa_overlap",
    "unresolved_dwas",
]

hier_rows = []
_hier_total_pipe_dwas = 0  # running count of pipeline DWAs over all MCPs
for tl in sorted(_overlap):
    # First matching row per normalised title in each frame.
    t_row = teddy_occ[_hier_teddy_titles == tl].iloc[0]
    p_row = mcp_results[_hier_pipe_titles == tl].iloc[0]
    orig  = _pipe_lower[tl]

    # Teddy's GWAs and IWAs (semicolon-separated within a single cell)
    teddy_gwas = set(split_cell(t_row["gwa"]))
    teddy_iwas = set(split_cell(t_row["iwa"]))

    # Map pipeline DWAs -> GWA/IWA via the O*NET hierarchy lookup
    pipe_dwas = parse_sc_set(p_row.get("dwas_used_for_tasks"))
    _hier_total_pipe_dwas += len(pipe_dwas)
    pipe_gwas, pipe_iwas, unresolved = set(), set(), 0
    for dwa in pipe_dwas:
        entry = dwa_to_hier.get(dwa)
        if entry:
            pipe_gwas.add(entry["gwa"])
            pipe_iwas.add(entry["iwa"])
        else:
            # DWA has no (truthy) entry in the lookup; counted, not mapped.
            unresolved += 1

    gwa_rec, gwa_jac = _rj_sets(teddy_gwas, pipe_gwas)
    iwa_rec, iwa_jac = _rj_sets(teddy_iwas, pipe_iwas)

    hier_rows.append({
        "mcp_title":       orig,
        "teddy_gwas":      " | ".join(sorted(teddy_gwas)),
        "pipe_n_gwas":     len(pipe_gwas),
        "gwa_recall":      gwa_rec,
        "gwa_jaccard":     gwa_jac,
        "gwa_overlap":     " | ".join(sorted(teddy_gwas & pipe_gwas)) or "(none)",
        "teddy_iwas":      " | ".join(sorted(teddy_iwas)),
        "pipe_n_iwas":     len(pipe_iwas),
        "iwa_recall":      iwa_rec,
        "iwa_jaccard":     iwa_jac,
        "iwa_overlap":     " | ".join(sorted(teddy_iwas & pipe_iwas)) or "(none)",
        "unresolved_dwas": unresolved,
    })

hier_df = pd.DataFrame(hier_rows, columns=_HIER_COLUMNS)

print("GWA / IWA ALIGNMENT \u2014 Pipeline DWA Hierarchy vs Teddy")
print("=" * 70)

print("\n--- GWA (General Work Activity) ---")
print(hier_df[["mcp_title","teddy_gwas","pipe_n_gwas",
               "gwa_recall","gwa_jaccard","gwa_overlap"]].to_string(index=False))
# Rows where Teddy listed no GWAs carry a null recall and are excluded here.
gwa_recs = hier_df["gwa_recall"].dropna()
if not gwa_recs.empty:
    print(f"\n  Mean GWA recall :  {gwa_recs.mean():.1%}")
    print(f"  Mean GWA Jaccard:  {hier_df['gwa_jaccard'].dropna().mean():.3f}")

print("\n--- IWA (Intermediate Work Activity) ---")
print(hier_df[["mcp_title","teddy_iwas","pipe_n_iwas",
               "iwa_recall","iwa_jaccard","iwa_overlap"]].to_string(index=False))
iwa_recs = hier_df["iwa_recall"].dropna()
if not iwa_recs.empty:
    print(f"\n  Mean IWA recall :  {iwa_recs.mean():.1%}")
    print(f"  Mean IWA Jaccard:  {hier_df['iwa_jaccard'].dropna().mean():.3f}")

total_unres = hier_df["unresolved_dwas"].sum()
if total_unres > 0:
    # Count was accumulated in the main loop, so no second pass over
    # mcp_results is needed to get the denominator.
    total_pipe = _hier_total_pipe_dwas
    print(f"\n  Note: {total_unres} of {total_pipe} pipeline DWAs "
          "could not be resolved in the O*NET hierarchy lookup.")

GWA / IWA ALIGNMENT — Pipeline DWA Hierarchy vs Teddy

--- GWA (General Work Activity) ---
                                         mcp_title                                                                                                                                                                                                                                                                                                                                                                      teddy_gwas  pipe_n_gwas  gwa_recall  gwa_jaccard                                                                               gwa_overlap
                                  Alist MCP Server                                                                                                                                                                                                                                                                                                                        

In [10]:
# ============================================================
# Section 4: Overall Summary
# ============================================================

# Banner: the same 70-character rule above and below the heading.
_summary_rule = "=" * 70
print(_summary_rule)
print("HUMAN VALIDATION \u2014 SUMMARY")
print(_summary_rule)
print(
    "\nSample: {} MCPs in common  (Teddy labeled: {} | Pipeline total: {:,})".format(
        len(_overlap), teddy_occ["title"].nunique(), len(mcp_results)
    )
)

# DWA recovery, averaged over the rows of dwa_df with n_teddy_dwas > 0.
has_dwas = dwa_df["n_teddy_dwas"] > 0
if has_dwas.any():
    print("\nDWA Recovery (fraction of Teddy's DWAs found in pipeline output):")
    print("  vs dwas_selected (LLM-chosen, up to 15):   "
          + format(dwa_df.loc[has_dwas, "recall_selected"].mean(), ".1%"))
    print("  vs dwas_used_for_tasks (task rating input): "
          + format(dwa_df.loc[has_dwas, "recall_used"].mean(), ".1%"))
    print("  Jaccard similarity (used set):              "
          + format(dwa_df.loc[has_dwas, "jaccard_used"].mean(), ".3f"))

# Task coverage; the guard avoids dividing by zero when total_t is 0.
if total_t > 0:
    print("\nTask Context Coverage:")
    print("  Teddy tasks found in pipeline context: {} / {} = {:.1%}".format(
        in_ctx_n, total_t, in_ctx_n / total_t))

# Rating agreement over the matched tasks.
if not matched.empty:
    print("\nRating Agreement (matched tasks; Teddy implicit = 5):")
    _matched_ratings = matched["pipeline_rating"]
    print("  Mean pipeline rating:  {:.2f}  (MAE: {:.2f})".format(
        _matched_ratings.mean(), matched["abs_error"].mean()))
    print("  Pipeline rated >= 4:   {:.1%}".format((_matched_ratings >= 4).mean()))
    print("  Pipeline rated <= 2:   {:.1%}".format((_matched_ratings <= 2).mean()))

# GWA alignment: null recalls are dropped before averaging.
gwa_recs = hier_df["gwa_recall"].dropna()
if not gwa_recs.empty:
    print("\nGWA Alignment (broad work-activity categories):")
    print("  Mean recall :  " + format(gwa_recs.mean(), ".1%"))
    print("  Mean Jaccard:  " + format(hier_df["gwa_jaccard"].dropna().mean(), ".3f"))

# IWA alignment: same treatment as GWA.
iwa_recs = hier_df["iwa_recall"].dropna()
if not iwa_recs.empty:
    print("\nIWA Alignment (intermediate work activities):")
    print("  Mean recall :  " + format(iwa_recs.mean(), ".1%"))
    print("  Mean Jaccard:  " + format(hier_df["iwa_jaccard"].dropna().mean(), ".3f"))

# Legend for the metrics reported above, emitted as one write.
print("\n".join([
    "",
    "Key:",
    "  Recall  = fraction of Teddy's labels recovered by the pipeline (coverage of ground truth)",
    "  Jaccard = |intersection| / |union| (penalises over-selection)",
    "  MAE     = mean absolute error in 1\u20135 rating (pipeline vs Teddy's implicit 5)",
]))

HUMAN VALIDATION — SUMMARY

Sample: 26 MCPs in common  (Teddy labeled: 28 | Pipeline total: 10,140)

DWA Recovery (fraction of Teddy's DWAs found in pipeline output):
  vs dwas_selected (LLM-chosen, up to 15):   36.9%
  vs dwas_used_for_tasks (task rating input): 29.9%
  Jaccard similarity (used set):              0.064

Task Context Coverage:
  Teddy tasks found in pipeline context: 45 / 71 = 63.4%

Rating Agreement (matched tasks; Teddy implicit = 5):
  Mean pipeline rating:  3.36  (MAE: 1.64)
  Pipeline rated >= 4:   53.3%
  Pipeline rated <= 2:   26.7%

GWA Alignment (broad work-activity categories):
  Mean recall :  50.1%
  Mean Jaccard:  0.140

IWA Alignment (intermediate work activities):
  Mean recall :  32.5%
  Mean Jaccard:  0.077

Key:
  Recall  = fraction of Teddy's labels recovered by the pipeline (coverage of ground truth)
  Jaccard = |intersection| / |union| (penalises over-selection)
  MAE     = mean absolute error in 1–5 rating (pipeline vs Teddy's implicit 5)
