In [3]:
import re
import json
from urllib.parse import urlparse
from collections import defaultdict

INPUT_FILE = "sessiontrack2014.xml"   # Update path as needed

# Regex patterns.
# parse_trec_sessions applies these to one line of the file at a time (no XML
# parser is used), so each pattern only works when its tag and the attributes
# it captures sit on a single line.

# <session ...>: group 1 = session number, group 2 = user id.
# The pattern requires num="..." to appear before userid="..." in the tag.
SESSION_RE = re.compile(r'<session[^>]*num="(\d+)"[^>]*userid="(\d+)"')
# <topic num="N">: group 1 = topic number (num must be the first attribute).
TOPIC_RE = re.compile(r'<topic num="(\d+)"')
# <interaction ... starttime="T">: group 1 = start time (digits and dots only).
INTERACTION_START_RE = re.compile(r'<interaction[^>]*starttime="([\d\.]+)"')

# <result rank="N">: group 1 = rank of a listed search result.
RESULT_RANK_RE = re.compile(r'<result rank="(\d+)">')
# <url>...</url>: group 1 = the URL text (non-greedy, single line).
URL_RE = re.compile(r"<url>(.*?)</url>")

# <click ...>: group 1 = click number, group 2 = starttime, group 3 = endtime.
# Attribute order num -> starttime -> endtime is required by the pattern.
CLICK_BLOCK_RE = re.compile(r'<click[^>]*num="(\d+)"[^>]*starttime="([\d\.]+)"[^>]*endtime="([\d\.]+)"')
# <rank>N</rank>: group 1 = rank of the result that was clicked.
CLICK_RANK_RE = re.compile(r"<rank>(\d+)</rank>")


def extract_domain(url):
    """Return the lower-cased network location (host[:port]) of ``url``.

    Args:
        url: URL string, e.g. ``"http://Example.com/page"``.

    Returns:
        The lower-cased ``netloc`` component of the URL. This is an empty
        string when the URL has no ``scheme://`` prefix (``urlparse`` then
        reports no netloc). ``None`` is returned when ``url`` is not a string
        or when ``urlparse`` rejects it.
    """
    if not isinstance(url, str):
        return None
    try:
        return urlparse(url).netloc.lower()
    except ValueError:
        # urlparse raises ValueError for malformed hosts (e.g. an unbalanced
        # IPv6 bracket). Only that case is treated as "no domain"; a bare
        # ``except`` would also hide KeyboardInterrupt and programming errors.
        return None


def parse_trec_sessions(path, min_clicks=2):
    """
    Parse a TREC Session Track log into per-user, per-topic click lists.

    The file is scanned line by line with the module-level regexes (it is not
    parsed as XML), so it relies on each tag of interest sitting on its own
    line. For every click, the clicked rank is looked up in the result list of
    the interaction it belongs to and resolved to the result URL's domain.

    Args:
        path: Path of the session log to read (opened as UTF-8).
        min_clicks: Minimum number of resolved clicks a (user, topic) entry
            needs in order to be kept.

    Returns structure:
    {
        "user_3": {
            "topic_10": [
                {"timestamp": 149.41, "domain": "...", "rank": 2},
                ...
            ],
            ...
        },
        ...
    }

    Notes:
        * Clicks are keyed by (user, topic) only, so several sessions of the
          same user on the same topic are appended to one list.
        * "timestamp" is the click's ``starttime`` attribute as a float.
        * Clicks whose rank is missing from the current result list, or whose
          URL yields no domain, are dropped.
    """
    data = defaultdict(lambda: defaultdict(list))

    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    n_lines = len(lines)

    current_user = None
    current_topic = None

    inside_results = False
    results_buffer = {}  # rank -> URL for the current interaction

    click_start_time = None  # starttime of the click currently being read

    for i, line in enumerate(lines):

        # --- Parse session start ---
        m = SESSION_RE.search(line)
        if m:
            current_user = f"user_{m.group(2)}"
            # A new session must not inherit the previous session's topic,
            # result list or half-read click.
            current_topic = None
            inside_results = False
            results_buffer = {}
            click_start_time = None
            continue

        # --- Parse topic ---
        m = TOPIC_RE.search(line)
        if m:
            current_topic = f"topic_{m.group(1)}"
            continue

        # --- Interaction start (resets results buffer) ---
        m = INTERACTION_START_RE.search(line)
        if m:
            inside_results = False
            results_buffer = {}
            click_start_time = None
            continue

        # --- Enter results block ---
        if "<results>" in line:
            inside_results = True
            continue

        # --- Leave results block ---
        if "</results>" in line:
            inside_results = False
            continue

        # --- Inside results block: store rank -> URL ---
        if inside_results:
            rank_match = RESULT_RANK_RE.search(line)
            if rank_match:
                rank = int(rank_match.group(1))
                # Prefer a <url> on the same line; otherwise look at the next
                # line, guarding against the tag being the last line of the
                # file (the unguarded lines[i + 1] raised IndexError there).
                url_match = URL_RE.search(line)
                if url_match is None and i + 1 < n_lines:
                    url_match = URL_RE.search(lines[i + 1])
                if url_match:
                    results_buffer[rank] = url_match.group(1)
            continue

        # --- Click block: get click timestamp ---
        m = CLICK_BLOCK_RE.search(line)
        if m:
            click_start_time = float(m.group(2))
            continue

        # --- Rank inside a click block ---
        rank_match = CLICK_RANK_RE.search(line)
        if rank_match and click_start_time is not None:
            clicked_rank = int(rank_match.group(1))
            clicked_url = results_buffer.get(clicked_rank)

            # Without a known user and topic the click cannot be attributed;
            # storing it under a None key would also corrupt the JSON output.
            if clicked_url and current_user is not None and current_topic is not None:
                domain = extract_domain(clicked_url)
                if domain:
                    data[current_user][current_topic].append({
                        "timestamp": click_start_time,
                        "domain": domain,
                        "rank": clicked_rank
                    })

            # Reset click timestamp so next clicks don't reuse it incorrectly
            click_start_time = None

    # --- Filter out (user, topic) entries with fewer than min_clicks ---
    filtered = defaultdict(dict)
    for user, topics in data.items():
        for topic, clicks in topics.items():
            if len(clicks) >= min_clicks:
                filtered[user][topic] = clicks

    return filtered


# -----------------------------
# Run Parser
# -----------------------------
parsed = parse_trec_sessions(INPUT_FILE, min_clicks=2)

# Persist the filtered click log so later cells can load it from disk.
with open("trec_human_parsed.json", "w") as out_file:
    json.dump(parsed, out_file, indent=2)

# One retained "session" is one (user, topic) entry that passed the filter.
n_users = len(parsed)
n_retained = sum(len(topics) for topics in parsed.values())

print("Done.")
print("Users:", n_users)
print("Total retained sessions:", n_retained)


Done.
Users: 75
Total retained sessions: 367


In [59]:
import json
import glob
import re
from pathlib import Path
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# -------------------------------------------------------------------
# Paths
# -------------------------------------------------------------------
# JSON file holding the parsed human click sessions.
HUMAN_JSON = Path("trec_human_parsed.json")   # adjust if needed
# Glob for agent traffic logs; the number in the file name is read as the topic id.
AGENT_GLOB = "autogen_log/gpt-4o/agent_traffic_*.txt"           # e.g., agent_traffic_10.txt

# -------------------------------------------------------------------
# Noise filtering rules for agent traffic
# -------------------------------------------------------------------

# Substrings that mark a request as noise. is_noisy_agent_request tests each
# entry with `keyword in domain.lower()`, i.e. as a plain substring anywhere in
# the host name, not as a whole label or suffix. Consequences worth knowing:
#   * short entries such as "msn", "bing", "mdn", "rtb", "cdn", "track",
#     "banner", "-app" or "api." also match any unrelated host containing them;
#   * some entries are redundant because another entry is a substring of them
#     or they are listed twice ("doubleverify", "google.", "analytics." vs
#     "analytics", "widget." vs "widget", "amazonaws.com" vs "amazonaws.").
# Redundant entries do not change the result, only the amount of work.
NOISY_KEYWORDS = [
    "doubleclick", "hubspot", "adspsp", "onetrust.", "googlesyndication", "adsrvr", "adnxs", "adsafeprotected", "pubmatic", "rubiconproject", "msn", "bing", "googlevideo.", "googletagmanager.", "adform", "salesforceliveagent",
    "mdn", "admaster", "criteo", "rtb", "yieldmo", "taboola", "outbrain", "quantserve", "insightexpress",
    "adservice", "amazon-adsystem", "aps.amazon", "adtrafficquality.google", "ads.", "advertising",
    "banner", "cookie", "ezodn", "ezoic", "adlightning", "liadm.com", "media.net", "pm-serv.co",
    "rqtrk.eu", "mgid", "adblocker", "admanmedia", "hotjar.", "clickguard", "contextweb", "doubleverify",
    "analytics", "google-analytics", "facebook", "twitter", "linkedin", "scorecardresearch", "pixel",
    "tracking", "track", "metrics.", "sentry", "clarity.ms", "fullstory", "crazyegg", "segment",
    "cdn", "fonts", "gstatic", "cloudfront", "cloudflare", "moatads", "optimizely", "js.", "api.",
    "virtualearth", "apis.", "wp.com", "hlx.page", "vo.msecnd.net", "mpio.io",
    "amazonaws.com", "objectstorage", "azureedge.net", "fastly.net", "cdn.jsdelivr.net", "akamaihd.net",
    "qualified", "recombee", "leadsrx", "crwdcntrl", "typekit.", 
    "imasdk", "translate.google", "calendar.google", "accounts.google", "googleapis", "cse.google.com",
    "fundingchoicesmessages.google.com", "google.", "bidsxchange", "avplayer", "usercontent.", "css.", "scripts.",
    "privacymanager.io", "foresee.com", "qualaroo", "trustarc", "truste", "gatekeeperconsent",
    "givebutter", "churnkey", "fundraiseup", "mailerlite", "marketo", "pardot", "hsforms", "paperform",
    "activedemand", "activehosted", "convertkit", "script.", "googletag", "widget",
    "qualtrics", "monsido", "igodigital", "drivetheweb", "addtoany", "liveperson.", 
    "squarespace", "cloudinary", "wistia", "forbesimg", "imageio.forbes.com", "ytimg", "open.video",
    "humix", "recommendation.forbes.com", "analytics.", "google.", "tiktok", "server.", 
    "amazonaws.", "api-domain-compado.com", "xapstream", "adroll", "-app", "3lift.", "beacon",
    "pinterest", "pinimg.com", "quora", "instagram", "yimg.com", "recaptcha", "sharethis", "highperformanceformat.com", "fontawesome.", ".content", "ad.gt", "assets-", 
    "jwplayer", "assets.", "jquery.", "static.", "asset-", "app-", "app.", "js-", "widget.", "captcha", "permutive.app", "youtube.", "lytics", "adthrive.", "wikimedia", "doubleverify",
]


# File extensions treated as static assets. is_noisy_agent_request compares
# them with `path.lower().endswith(ext)`, so they must be lower-case and
# include the leading dot.
NOISY_EXTENSIONS = [
    ".js", ".css", ".png", ".jpg", ".jpeg", ".gif", ".svg",
    ".ico", ".woff", ".woff2", ".ttf", ".map", ".json"
]

def is_noisy_agent_request(domain, path, keywords=None, extensions=None):
    """Decide whether an agent request is noise (ads, trackers, static assets).

    Args:
        domain: Host name of the request.
        path: Request path; callers may pass the whole "domain + path" text.
        keywords: Substrings that flag a host as noisy. Defaults to the
            module-level ``NOISY_KEYWORDS``.
        extensions: File suffixes that flag a static asset. Defaults to the
            module-level ``NOISY_EXTENSIONS``.

    Returns:
        True when any rule matches, otherwise False. The rules are:
        a keyword occurs anywhere in the lower-cased host; the path (with any
        ``?query`` or ``#fragment`` removed) ends with a listed extension;
        the path contains "favicon"; the path contains "/assets/" or
        "/static/".
    """
    if keywords is None:
        keywords = NOISY_KEYWORDS
    if extensions is None:
        extensions = NOISY_EXTENSIONS

    host = domain.lower()
    if any(keyword in host for keyword in keywords):
        return True

    path_lc = path.lower()

    # Cache-busting queries ("app.js?v=3") and fragments would otherwise hide
    # the extension from the endswith() test.
    resource = path_lc.split("?", 1)[0].split("#", 1)[0]
    if resource.endswith(tuple(ext.lower() for ext in extensions)):
        return True

    if "favicon" in path_lc:
        return True

    # assets / static folders
    if "/assets/" in path_lc or "/static/" in path_lc:
        return True

    return False

# -------------------------------------------------------------------
# 1) Helper: session-level features
#    events: list of dicts with keys:
#       human: {"timestamp": float, "domain": str}
#       agent: {"timestamp": float, "domain": str} after conversion
# -------------------------------------------------------------------
def session_features_from_events(events, time_key="timestamp"):
    """Summarise one session's events as a flat feature dict.

    Args:
        events: List of dicts, each with a "domain" entry and a numeric time
            stored under ``time_key``.
        time_key: Name of the dict key that holds the event time.

    Returns:
        Dict with, in this order: "n_events", "n_domains", "domain_entropy"
        (Shannon entropy in bits of the domain frequencies), "duration_s"
        (last minus first time), "mean_gap_s" and "std_gap_s" (mean and
        population standard deviation of the gaps between consecutive sorted
        times). An empty ``events`` gives all zeros; a single event gives
        zero duration and zero gap statistics.
    """
    features = {
        "n_events": 0,
        "n_domains": 0,
        "domain_entropy": 0.0,
        "duration_s": 0.0,
        "mean_gap_s": 0.0,
        "std_gap_s": 0.0,
    }
    if not events:
        return features

    # Timing: order the stamps, then difference neighbouring pairs.
    stamps = [event[time_key] for event in events]
    stamps.sort()
    inter_arrival = [later - earlier for earlier, later in zip(stamps[:-1], stamps[1:])]

    # Domain diversity: relative frequency of every distinct domain.
    domain_counts = Counter(event["domain"] for event in events)
    shares = np.array(list(domain_counts.values()), dtype=float)
    shares = shares / shares.sum()

    features["n_events"] = len(events)
    features["n_domains"] = len(domain_counts)
    features["domain_entropy"] = float(-(shares * np.log2(shares)).sum())
    features["duration_s"] = float(stamps[-1] - stamps[0])
    if len(inter_arrival) > 0:
        features["mean_gap_s"] = float(np.mean(inter_arrival))
        features["std_gap_s"] = float(np.std(inter_arrival, ddof=0))

    return features

# -------------------------------------------------------------------
# 2) Parse agent logs with noise filtering
# -------------------------------------------------------------------
def parse_clean_agent_log(path: str, min_size: int = 7000, noise_filter=None):
    """Read one agent traffic log and return its de-noised request events.

    Only lines containing both "[ISP]" and "Domain:" are considered. From each
    such line the function reads: the ISO timestamp between "[ISP]" and the
    first "|", the text between "Domain:" and the next "|" (its first
    whitespace-separated token is the domain), and the integer after "Size:".

    Args:
        path: Log file to read (decoded as UTF-8, undecodable bytes ignored).
        min_size: Requests whose size is unknown or smaller than this are
            dropped.
        noise_filter: Callable ``(domain, path) -> bool`` returning True for
            requests to drop. Defaults to ``is_noisy_agent_request``.

    Returns:
        List of ``{"timestamp": float, "domain": str}`` dicts in file order,
        where "timestamp" is seconds since the earliest kept event. Empty list
        when nothing survives the filters.
    """
    if noise_filter is None:
        noise_filter = is_noisy_agent_request

    size_re = re.compile(r"Size:\s*(\d+)")
    text = Path(path).read_text(encoding="utf-8", errors="ignore")
    events = []

    for line in text.splitlines():
        if "[ISP]" not in line or "Domain:" not in line:
            continue

        # -------------------------------
        # Extract timestamp
        # -------------------------------
        after_tag = line.split("[ISP]", 1)[1].strip()
        time_str = after_tag.split("|", 1)[0].strip()
        try:
            t = datetime.fromisoformat(time_str)
        except ValueError:
            # Not an ISO timestamp: skip the line. Only this error is
            # expected here; anything else should surface.
            continue

        # -------------------------------
        # Extract domain + path
        # -------------------------------
        dom_section = line.split("Domain:", 1)[1].strip()
        dom_section = dom_section.split("|", 1)[0].strip()
        if not dom_section:
            continue

        domain = dom_section.split()[0]
        path_text = dom_section  # may include path after domain

        # -------------------------------
        # Extract packet size ("unknown" or a missing field -> None)
        # -------------------------------
        m = size_re.search(line)
        size = int(m.group(1)) if m else None

        # -------------------------------
        # Filtering conditions
        # -------------------------------

        # Very small packets = favicon/cookie/tracker/etc.
        if size is None or size < min_size:
            continue

        # Domain/path-based noise
        if noise_filter(domain, path_text):
            continue

        events.append({"timestamp": t, "domain": domain})

    if not events:
        return []

    # Convert absolute timestamp -> relative seconds from session start
    t0 = min(e["timestamp"] for e in events)
    for e in events:
        e["timestamp"] = (e["timestamp"] - t0).total_seconds()

    return events


# -------------------------------------------------------------------
# 3) Load human sessions from trec_human_parsed.json
#    Structure: { "user_3": { "topic_10": [ {timestamp, domain, rank}, ...], ...}, ... }
# -------------------------------------------------------------------
human_raw = json.loads(HUMAN_JSON.read_text())

rows = []

for user_id, topics in human_raw.items():
    for topic_key, events in topics.items():
        m = re.match(r"topic_(\d+)", topic_key)
        if not m:
            continue
        topic = int(m.group(1))

        # Human timestamps are used exactly as stored in the JSON
        feats = session_features_from_events(events, time_key="timestamp")
        rows.append(
            {
                "source": "human",
                "user_id": user_id,   # e.g., "user_3"
                "topic": topic,
                **feats,
            }
        )

# -------------------------------------------------------------------
# 4) Load agent sessions from agent_traffic_{topic}.txt
#    One session per topic file
# -------------------------------------------------------------------
# glob.glob returns files in filesystem order, which differs between machines.
# Sorting fixes the row order of df_sessions, and therefore the outcome of
# every later DataFrame.sample(random_state=...) call.
for path in sorted(glob.glob(AGENT_GLOB)):
    m = re.search(r"agent_traffic_(\d+)\.txt", path)
    if not m:
        continue
    topic = int(m.group(1))

    agent_events = parse_clean_agent_log(path)
    if not agent_events:
        # Nothing survived noise filtering: no session row for this topic.
        continue

    feats = session_features_from_events(agent_events, time_key="timestamp")
    rows.append(
        {
            "source": "agent",
            "user_id": f"agent_{topic}",
            "topic": topic,
            **feats,
        }
    )

# -------------------------------------------------------------------
# 5) Final session-level dataframe
# -------------------------------------------------------------------
df_sessions = pd.DataFrame(rows)

print("Session-level dataframe shape:", df_sessions.shape)
print(df_sessions.head())
print(df_sessions["source"].value_counts())


Session-level dataframe shape: (427, 9)
  source user_id  topic  n_events  n_domains  domain_entropy  duration_s  \
0  human  user_2     56         4          4        2.000000     93.9027   
1  human  user_2     58         3          3        1.584963    124.1269   
2  human  user_2     26         5          5        2.321928    107.7516   
3  human  user_3     10        12         12        3.584963    136.7327   
4  human  user_3      5         6          3        1.459148    101.2957   

   mean_gap_s  std_gap_s  
0   31.300900   6.272443  
1   62.063450  44.373750  
2   26.937900  23.625530  
3   12.430245   7.927795  
4   20.259140  24.426841  
source
human    367
agent     60
Name: count, dtype: int64


In [60]:
df_sessions

Unnamed: 0,source,user_id,topic,n_events,n_domains,domain_entropy,duration_s,mean_gap_s,std_gap_s
0,human,user_2,56,4,4,2.000000,93.902700,31.300900,6.272443
1,human,user_2,58,3,3,1.584963,124.126900,62.063450,44.373750
2,human,user_2,26,5,5,2.321928,107.751600,26.937900,23.625530
3,human,user_3,10,12,12,3.584963,136.732700,12.430245,7.927795
4,human,user_3,5,6,3,1.459148,101.295700,20.259140,24.426841
...,...,...,...,...,...,...,...,...,...
422,agent,agent_6,6,73,5,1.619393,510.302326,7.087532,27.940488
423,agent,agent_60,60,571,16,2.607466,819.793586,1.438234,7.387941
424,agent,agent_7,7,91,17,3.030924,904.165151,10.046279,30.845047
425,agent,agent_8,8,38,4,1.336724,81.827232,2.211547,11.886999


In [72]:
# Session-level columns fed to the scaler / clustering / classifier cells.
# Every name must be a key produced by session_features_from_events.
FEATURE_COLS = [
    # "n_events",   # currently excluded from the feature set (reason not recorded here -- TODO document)
    "n_domains",
    "domain_entropy",
    "duration_s",
    "mean_gap_s",
    "std_gap_s",
]


In [73]:

def cluster_per_topic(df, feature_cols=None, verbose=True):
    """Run 2-cluster KMeans separately on the sessions of every topic.

    Topics that contain only one distinct "source" value are skipped. For the
    others, features are standardized within the topic, clustered with k=2,
    and each cluster is named after the most frequent source among its rows.

    Args:
        df: Session dataframe with "topic", "source", "user_id" and the
            feature columns.
        feature_cols: Columns to cluster on. Defaults to the module-level
            ``FEATURE_COLS``.
        verbose: When True, print the per-topic assignment table and purity.

    Returns:
        Concatenation of the per-topic rows with three extra columns:
        "cluster_id", "cluster_source_majority" and "cluster_correct"
        (majority label equals the row's source). An empty DataFrame when no
        topic qualified.

    Note:
        The majority label is derived from the true sources, so
        "cluster_correct" is an evaluation aid, not a prediction. Its mean can
        never fall below the share of the most frequent source in the topic.
    """
    if feature_cols is None:
        feature_cols = FEATURE_COLS

    all_topic_results = []

    for topic, sub in df.groupby("topic"):
        if sub["source"].nunique() < 2:
            # No agent log or only one class -> skip
            continue

        X = sub[feature_cols].values
        Xs = StandardScaler().fit_transform(X)

        km = KMeans(n_clusters=2, n_init=10, random_state=0)
        cluster_labels = km.fit_predict(Xs)

        # Majority-map only the clusters that actually received rows. KMeans
        # can leave a cluster empty (e.g. identical feature rows), and calling
        # .mode()[0] on an empty selection raises.
        mapping = {}
        for c in np.unique(cluster_labels):
            mask = cluster_labels == c
            mapping[int(c)] = sub.loc[mask, "source"].mode()[0]

        pred_source = [mapping[int(c)] for c in cluster_labels]

        sub_out = sub.copy()
        sub_out["cluster_id"] = cluster_labels
        sub_out["cluster_source_majority"] = pred_source
        sub_out["cluster_correct"] = sub_out["cluster_source_majority"] == sub_out["source"]

        if verbose:
            print(f"\n=== Topic {topic} ===")
            print(
                sub_out[["user_id", "source", "cluster_id", "cluster_source_majority"]]
                .sort_values("source")
            )
            print("Cluster purity:", sub_out["cluster_correct"].mean())

        all_topic_results.append(sub_out)

    if all_topic_results:
        return pd.concat(all_topic_results, ignore_index=True)
    else:
        return pd.DataFrame()


cluster_results = cluster_per_topic(df_sessions)




=== Topic 1 ===
     user_id source  cluster_id cluster_source_majority
367  agent_1  agent           0                   agent
25    user_4  human           1                   human
51    user_6  human           1                   human
73   user_14  human           1                   human
225  user_36  human           1                   human
237   user_9  human           0                   agent
305  user_67  human           1                   human
334  user_94  human           1                   human
Cluster purity: 0.875

=== Topic 2 ===
     user_id source  cluster_id cluster_source_majority
378  agent_2  agent           0                   human
78   user_15  human           0                   human
130  user_18  human           1                   human
163  user_16  human           0                   human
218  user_35  human           0                   human
221  user_36  human           0                   human
245  user_40  human           1                 

In [74]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def cluster_one_vs_one_per_topic(df, feature_cols, n_repeats=20):
    """
    Pairwise human-vs-agent clustering, one topic at a time.

    For every topic that has at least one human and one agent session, the
    first agent session is paired ``n_repeats`` times with one human session
    drawn using ``random_state=repeat``. Each pair is standardized on its own
    and split with 2-cluster KMeans.

    Returns a DataFrame with one row per (topic, repeat): "topic", "repeat",
    "human_user_id", "agent_user_id", "human_label", "agent_label" and
    "perfect_sep" (True when the two sessions got different labels).

    NOTE(review): with exactly two rows, standardization maps every differing
    feature to opposite signs, and k=2 KMeans then places the two points in
    different clusters whenever they differ at all. "perfect_sep" is
    therefore expected to be True for any non-identical pair and says little
    about how distinguishable the sources are -- confirm this is intended.
    """
    records = []

    for topic, topic_df in df.groupby("topic"):
        human_rows = topic_df.loc[topic_df["source"] == "human"]
        agent_rows = topic_df.loc[topic_df["source"] == "agent"]

        # Both sides are required to form a pair.
        if len(human_rows) == 0 or len(agent_rows) == 0:
            continue

        agent_session = agent_rows.iloc[0]

        for repeat in range(n_repeats):
            # Draw one human session; the repeat index doubles as the seed.
            human_session = human_rows.sample(1, random_state=repeat).iloc[0]

            pair_frame = pd.DataFrame([human_session, agent_session])
            scaled = StandardScaler().fit_transform(pair_frame[feature_cols].values)

            model = KMeans(n_clusters=2, n_init=10, random_state=repeat)
            assigned = model.fit_predict(scaled)

            # Row 0 is the human, row 1 the agent (construction order above).
            label_h, label_a = assigned[0], assigned[1]

            records.append({
                "topic": topic,
                "repeat": repeat,
                "human_user_id": human_session["user_id"],
                "agent_user_id": agent_session["user_id"],
                "human_label": int(label_h),
                "agent_label": int(label_a),
                "perfect_sep": (label_h != label_a)
            })

    return pd.DataFrame(records)


In [75]:
pair = cluster_one_vs_one_per_topic(df_sessions, FEATURE_COLS, n_repeats=20)

pair

Unnamed: 0,topic,repeat,human_user_id,agent_user_id,human_label,agent_label,perfect_sep
0,1,0,user_94,agent_1,1,0,True
1,1,1,user_94,agent_1,0,1,True
2,1,2,user_9,agent_1,0,1,True
3,1,3,user_9,agent_1,1,0,True
4,1,4,user_9,agent_1,1,0,True
...,...,...,...,...,...,...,...
1195,60,15,user_16,agent_60,1,0,True
1196,60,16,user_4,agent_60,0,1,True
1197,60,17,user_18,agent_60,0,1,True
1198,60,18,user_132,agent_60,1,0,True


In [76]:
def cluster_global(df, feature_cols):
    """
    Global unsupervised clustering:
      - mix all human + agent sessions
      - standardize the features and cluster into k=2
      - name each cluster after its most frequent source
      - compute global purity (share of rows whose source equals the name of
        their cluster)

    Args:
        df: Session dataframe with a "source" column and the feature columns.
        feature_cols: Columns to cluster on.

    Returns:
        ``(out, purity)``: ``out`` is a copy of ``df`` with the extra columns
        "cluster", "cluster_source_majority" and "cluster_correct"; ``purity``
        is a float.

    Note:
        Purity under majority mapping cannot fall below the share of the most
        frequent source. When both clusters receive the same name, purity
        equals that share exactly, so compare the result against the class
        balance before reading it as evidence of separation.
    """
    X = df[feature_cols].values
    y = df["source"].values

    Xs = StandardScaler().fit_transform(X)

    km = KMeans(n_clusters=2, n_init=10, random_state=0)
    clusters = km.fit_predict(Xs)

    # Majority-map clusters -> sources. Only clusters that received rows are
    # mapped: .mode()[0] on an empty selection raises.
    mapping = {}
    for c in np.unique(clusters):
        mask = clusters == c
        mapping[int(c)] = df.loc[mask, "source"].mode()[0]

    # Explicit array so the comparison with y is element-wise by construction.
    pred = np.array([mapping[int(c)] for c in clusters], dtype=object)
    correct = pred == y
    purity = float(correct.mean())

    out = df.copy()
    out["cluster"] = clusters
    out["cluster_source_majority"] = pred
    out["cluster_correct"] = correct

    return out, purity


def classify_global(df, feature_cols):
    """
    Global supervised classification (Logistic Regression, 5-fold CV)

    Args:
        df: Session dataframe with a "source" column and the feature columns.
        feature_cols: Columns used as predictors.

    Returns:
        Mean cross-validated accuracy over the 5 folds (1 = agent, 0 = human).

    Notes:
        * The scaler is part of the cross-validated pipeline, so it is fitted
          on each training fold only. Scaling all rows up front would leak
          test-fold statistics into training.
        * Accuracy should be read against the class balance of ``df``;
          always predicting the larger class already scores its share.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from sklearn.pipeline import make_pipeline

    X = df[feature_cols].values
    y = (df["source"] == "agent").astype(int).values  # binary: 1=agent, 0=human

    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    acc = cross_val_score(model, X, y, cv=5).mean()

    return acc


In [77]:
# Unsupervised global attribution
global_df, global_purity = cluster_global(df_sessions, FEATURE_COLS)
print("Global cluster purity:", global_purity)

# Supervised global attribution
global_acc = classify_global(df_sessions, FEATURE_COLS)
print("Global classification accuracy:", global_acc)


Global cluster purity: 0.8594847775175644
Global classification accuracy: 0.9789329685362518


In [78]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


def cluster_global_mixture(df, feature_cols,
                           n_humans=1, n_agents=3,
                           n_repeats=30,
                           random_state=0):
    """
    Global-level clustering experiment:
    - Sample n_humans human sessions (from ANY topic)
    - Sample n_agents agent sessions (from ANY topic)
    - Standardize the sample and cluster it (K=2)
    - Measure separability

    Args:
        df: Session dataframe with "source", "user_id" and the feature columns.
        feature_cols: Columns to cluster on.
        n_humans: Human sessions drawn per repeat (without replacement).
        n_agents: Agent sessions drawn per repeat (without replacement).
        n_repeats: Number of independent draws.
        random_state: Base seed. Repeat ``r`` uses ``random_state + r`` for
            both sampling and KMeans, so the default of 0 reproduces seeds
            0..n_repeats-1. ``None`` is treated as 0.

    Returns:
        DataFrame with one row per repeat: "repeat", "n_humans", "n_agents",
        "separability", "human_ids", "agent_ids", "cluster_assignments".

    Note:
        "separability" is the share of sampled sessions whose source equals
        the most frequent source of their cluster. It can never be lower than
        the larger of the two group shares (e.g. 0.75 for 1 human + 3 agents).
    """
    base_seed = 0 if random_state is None else int(random_state)

    humans = df[df["source"] == "human"]
    agents = df[df["source"] == "agent"]

    results = []

    for r in range(n_repeats):
        seed = base_seed + r

        # -------------------------
        # Sample mixed sessions globally
        # -------------------------
        sampled_hum = humans.sample(n_humans, random_state=seed)
        sampled_agents = agents.sample(n_agents, random_state=seed)

        mix_df = pd.concat([sampled_hum, sampled_agents], ignore_index=True)

        X = mix_df[feature_cols].values
        Xs = StandardScaler().fit_transform(X)

        # -------------------------
        # Clustering
        # -------------------------
        km = KMeans(n_clusters=2, n_init=20, random_state=seed)
        labels = km.fit_predict(Xs)

        mix_df["cluster"] = labels

        # -------------------------
        # Evaluate cluster composition
        # -------------------------
        # Name each cluster after its most frequent source; an empty cluster
        # gets None (no row maps to it, so None never reaches the comparison).
        majority_map = {}
        for c in [0, 1]:
            subset = mix_df[mix_df["cluster"] == c]
            if len(subset) == 0:
                majority_map[c] = None
            else:
                majority_map[c] = subset["source"].mode()[0]

        mix_df["cluster_majority"] = mix_df["cluster"].map(majority_map)
        mix_df["correct"] = mix_df["source"] == mix_df["cluster_majority"]

        # -------------------------
        # Store run-level metrics
        # -------------------------
        sep_score = mix_df["correct"].mean()

        results.append({
            "repeat": r,
            "n_humans": n_humans,
            "n_agents": n_agents,
            "separability": sep_score,
            "human_ids": list(sampled_hum["user_id"]),
            "agent_ids": list(sampled_agents["user_id"]),
            "cluster_assignments": labels.tolist()
        })

    return pd.DataFrame(results)


In [79]:
res_1v3 = cluster_global_mixture(
    df_sessions, FEATURE_COLS,
    n_humans=1, n_agents=3,
    n_repeats=50
)


In [69]:
res_1v3['separability']

0     0.75
1     1.00
2     0.75
3     1.00
4     1.00
5     0.75
6     0.75
7     1.00
8     1.00
9     1.00
10    1.00
11    0.75
12    0.75
13    0.75
14    1.00
15    1.00
16    1.00
17    0.75
18    1.00
19    1.00
20    0.75
21    0.75
22    1.00
23    1.00
24    1.00
25    1.00
26    1.00
27    1.00
28    1.00
29    1.00
30    1.00
31    0.75
32    1.00
33    0.75
34    1.00
35    1.00
36    1.00
37    1.00
38    1.00
39    0.75
40    0.75
41    1.00
42    1.00
43    1.00
44    0.75
45    0.75
46    0.75
47    1.00
48    0.75
49    1.00
Name: separability, dtype: float64

In [70]:
import pandas as pd
import numpy as np

# ------------------------------------------------------
# Run many human/agent combinations and summarize them
# ------------------------------------------------------

def run_separability_grid(df, feature_cols,
                          human_sizes=(1, 2, 3),
                          agent_sizes=(1, 3, 5, 10),
                          n_repeats=50):
    """Summarise cluster_global_mixture over a grid of sample sizes.

    Args:
        df: Session dataframe passed through to ``cluster_global_mixture``.
        feature_cols: Feature columns passed through unchanged.
        human_sizes: Iterable of human sample sizes to try.
        agent_sizes: Iterable of agent sample sizes to try.
        n_repeats: Repeats per (humans, agents) combination.

    Returns:
        DataFrame with one row per combination, in nested-loop order (humans
        outer, agents inner), holding the mean, sample standard deviation,
        minimum and maximum of the per-repeat "separability".
    """
    # Defaults are tuples: a list default is a single shared object that any
    # caller-side mutation would silently change for later calls.
    all_rows = []

    for h in human_sizes:
        for a in agent_sizes:
            results = cluster_global_mixture(
                df, feature_cols,
                n_humans=h,
                n_agents=a,
                n_repeats=n_repeats
            )

            separability = results["separability"]

            all_rows.append({
                "H (humans)": h,
                "A (agents)": a,
                "mean_separability": separability.mean(),
                "std": separability.std(),
                "min": separability.min(),
                "max": separability.max()
            })

    return pd.DataFrame(all_rows)


In [71]:
summary_df = run_separability_grid(
    df_sessions,
    FEATURE_COLS,
    human_sizes=[1,2,3,5],
    agent_sizes=[1,3,5,10],
    n_repeats=50
)

print(summary_df)


    H (humans)  A (agents)  mean_separability       std       min  max
0            1           1           1.000000  0.000000  1.000000  1.0
1            1           3           0.910000  0.121218  0.750000  1.0
2            1           5           0.896667  0.081719  0.833333  1.0
3            1          10           0.934545  0.041232  0.909091  1.0
4            2           1           0.873333  0.163438  0.666667  1.0
5            2           3           0.872000  0.126233  0.600000  1.0
6            2           5           0.845714  0.114867  0.714286  1.0
7            2          10           0.895000  0.064791  0.833333  1.0
8            3           1           0.840000  0.121218  0.750000  1.0
9            3           3           0.816667  0.147734  0.500000  1.0
10           3           5           0.795000  0.120479  0.625000  1.0
11           3          10           0.856923  0.082249  0.769231  1.0
12           5           1           0.880000  0.075593  0.833333  1.0
13    