In [9]:
import json
import math
from collections import Counter

# Path to your MoE log file
LOG_PATH = "moe_layer0.jsonl"  # change to moe_layer12.jsonl or moe_routes.jsonl if needed

# `meta` ends up holding the last record whose "type" is "meta" (None if the
# log has none); `routes` collects every "route" record in file order.
meta = None
routes = []

# The log is read as JSON Lines: one JSON object per non-blank line.
with open(LOG_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            rec = json.loads(line)
            if rec.get("type") == "route":
                routes.append(rec)
            elif rec.get("type") == "meta":
                meta = rec
            # Records with any other "type" are silently ignored.

# Quick summary of what was loaded.
print(
    f"Loaded {len(routes)} route records.",
    "Meta header:",
    meta,
    sep="\n",
)

Loaded 12805 route records.
Meta header:
{'type': 'meta', 'model_id': 'Qwen/Qwen1.5-MoE-A2.7B-Chat', 'vllm_version': '0.11.2.dev344+g35657bcd7', 'torch_version': '2.9.0+cu128', 'device': 'cuda', 'seed': 1234, 'layers_logged': [0], 'top_k': 4, 'created_at': 1764302556.0129569}


In [5]:
# Tally how often each expert ID appears across all route records.
# Counter keeps keys in first-seen order, which is the order printed below.
expert_counts = Counter()

for rec in routes:
    # rec["topk_ids"] is a list of expert IDs used for this token;
    # update() adds one to the count of every element of that list.
    expert_counts.update(rec["topk_ids"])

print(f"Number of unique experts seen: {len(expert_counts)}")
print("First 10 (expert_id -> count):")
# "First" means first encountered in the log, not most frequent.
first_pairs = list(expert_counts.items())[:10]
for eid, cnt in first_pairs:
    print(eid, "->", cnt)

Number of unique experts seen: 60
First 10 (expert_id -> count):
43 -> 8744
5 -> 8724
7 -> 8758
58 -> 8792
33 -> 98
16 -> 266
24 -> 332
27 -> 226
1 -> 336
49 -> 312


In [None]:
# Grand total of expert selections (each token contributes top_k selections)
total_selections = sum(expert_counts.values())

# (expert_id, count) pairs ordered from most- to least-selected; experts with
# equal counts stay in the order they were first encountered.
sorted_experts = expert_counts.most_common()

top3 = sorted_experts[:3]
print("Top-3 experts (by count):")
for eid, cnt in top3:
    frac = cnt / total_selections
    print(f"  Expert {eid}: count={cnt}, fraction={frac:.4f}")

# Normalized distribution: P(expert) = count / total_selections
expert_probs = {}
for eid, cnt in expert_counts.items():
    expert_probs[eid] = cnt / total_selections

# Sanity check: the probabilities should add up to 1.
print(f"\nSum of probabilities: {sum(expert_probs.values()):.6f}")


Top-3 experts (by count):
  Expert 58: count=8792, fraction=0.1717
  Expert 7: count=8758, fraction=0.1710
  Expert 43: count=8744, fraction=0.1707

Sum of probabilities: 1.000000


In [10]:
def entropy_bits(prob_dict):
    """Return the Shannon entropy, in bits, of a probability mapping.

    Args:
        prob_dict: Mapping whose values are probabilities. The keys are
            ignored; only ``prob_dict.values()`` is read.

    Returns:
        The sum of ``-p * log2(p)`` over every value ``p`` that is strictly
        greater than zero, as a float. An empty mapping yields ``0.0``.
    """
    total = 0.0
    for prob in prob_dict.values():
        # Skip anything that is not strictly positive: log2 is undefined there,
        # and a zero-probability outcome contributes nothing to the entropy.
        if not prob > 0:
            continue
        total += -prob * math.log2(prob)
    return total

# Shannon entropy of the observed expert-usage distribution.
H = entropy_bits(expert_probs)

# Upper bound: the entropy of a uniform distribution over the experts that
# actually appear in the log. NOTE(review): this counts only experts seen in
# `expert_probs`; if the layer has experts that were never routed to, the true
# maximum is larger -- confirm against the model config.
num_experts = len(expert_probs)
if num_experts > 0:
    max_entropy = math.log2(num_experts)
else:
    # No experts observed: the bound is undefined.
    max_entropy = float("nan")

print(
    f"Entropy of expert usage: {H:.4f} bits",
    f"Maximum possible entropy for {num_experts} experts: {max_entropy:.4f} bits",
    sep="\n",
)

Entropy of expert usage: 4.0977 bits
Maximum possible entropy for 60 experts: 5.9069 bits
