In [1]:
# 正负样本氨基酸相关性分析
from collections import Counter
import numpy as np


def read_fasta(filepath):
    """Read a FASTA file and return its sequences as a list of strings.

    Lines starting with '>' are record headers: they end the current record
    and are themselves discarded. Every other line is stripped of surrounding
    whitespace and appended to the current record. Records whose sequence is
    empty are not returned.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        list[str]: One concatenated sequence per non-empty record, in file order.
    """
    records = []
    pieces = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            if not raw_line.startswith('>'):
                pieces.append(raw_line.strip())
                continue
            # A header closes the record collected so far (if it has any residues).
            finished = ''.join(pieces)
            if finished:
                records.append(finished)
            pieces = []
    # Flush the last record, which has no header after it.
    trailing = ''.join(pieces)
    if trailing:
        records.append(trailing)
    return records


# Load the four FASTA splits, then merge train and test per class
pos_train = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/positive_train.fasta')
pos_test = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/positive_test.fasta')
neg_train = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/negative_train.fasta')
neg_test = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/negative_test.fasta')

pos_seqs = pos_train + pos_test
neg_seqs = neg_train + neg_test

# Global frequency statistics: residue letters, in the order used for per-residue vectors
amino_acids = list('ACDEFGHIKLMNPQRSTVWY')


def get_freq(seqs, alphabet='ACDEFGHIKLMNPQRSTVWY'):
    """Return the pooled frequency of each residue over all sequences.

    Counts are pooled across every sequence and divided by the total number
    of characters seen (including characters that are not in ``alphabet``).

    Args:
        seqs: Iterable of sequence strings.
        alphabet: Ordered residue letters; entry ``k`` of the result is the
            frequency of ``alphabet[k]``. The order is taken from this
            argument rather than from a module-level variable, so rebinding
            a global of the same purpose elsewhere cannot reorder the output.

    Returns:
        numpy.ndarray: Float vector of length ``len(alphabet)``. All zeros
        when the input holds no characters (instead of NaN from 0/0).
    """
    symbols = list(alphabet)
    total = Counter()
    total_len = 0
    for seq in seqs:
        total.update(seq)
        total_len += len(seq)
    counts = np.array([total[aa] for aa in symbols], dtype=float)
    if total_len == 0:
        # Nothing to normalise by: report zero frequencies rather than NaN.
        return counts
    return counts / total_len


# Pooled residue frequencies for each class (one value per residue letter)
pos_freq = get_freq(pos_seqs)
neg_freq = get_freq(neg_seqs)
print(pos_freq)
print(neg_freq)

In [2]:
# Length statistics for the positive set
lengths = [len(seq) for seq in pos_seqs]
print(f"正样本最大长度: {max(lengths)}, 最小长度: {min(lengths)}, 平均长度: {np.mean(lengths):.2f}")
# Membership set kept under its own name: rebinding `amino_acids` to a set here
# would discard the residue order that code expecting an ordered list relies on.
standard_residues = set("ACDEFGHIKLMNPQRSTVWY")
# Report every character that is not one of the 20 standard residue letters
for i, seq in enumerate(pos_seqs):
    for aa in seq:
        if aa not in standard_residues:
            print(f"第{i}条序列有异常字符: {aa}")

In [3]:
# Lookup set under its own name, so the ordered `amino_acids` list is not
# replaced by an unordered set as a side effect of running this cell.
VALID_RESIDUES = frozenset("ACDEFGHIKLMNPQRSTVWY")


def filter_valid_sequences(seqs, alphabet=VALID_RESIDUES):
    """Keep only the sequences made up entirely of allowed residue letters.

    Args:
        seqs: Iterable of sequence strings.
        alphabet: Allowed letters (any iterable of single characters).
            Defaults to the 20 letters in ``VALID_RESIDUES``.

    Returns:
        list[str]: The sequences whose every character is in ``alphabet``,
        in input order. An empty string passes, since it has no character
        to reject.
    """
    allowed = alphabet if isinstance(alphabet, (set, frozenset)) else frozenset(alphabet)
    return [seq for seq in seqs if all(aa in allowed for aa in seq)]
# Drop every sequence that contains a non-standard character, then report how many remain
pos_seqs_clean = filter_valid_sequences(pos_seqs)
neg_seqs_clean = filter_valid_sequences(neg_seqs)
print(f"过滤后正样本数: {len(pos_seqs_clean)}")
print(f"过滤后负样本数: {len(neg_seqs_clean)}")

In [4]:
def filter_by_length(seqs, max_len=1000):
    """Return the sequences whose length does not exceed ``max_len``, in input order."""
    kept = []
    for candidate in seqs:
        if len(candidate) > max_len:
            continue
        kept.append(candidate)
    return kept
# Keep sequences of at most 1000 residues; these *_final lists feed the correlation analysis
pos_seqs_final = filter_by_length(pos_seqs_clean, max_len=1000)
neg_seqs_final = filter_by_length(neg_seqs_clean, max_len=1000)
print(f'最终正样本数: {len(pos_seqs_final)}')
print(f'最终负样本数: {len(neg_seqs_final)}')

In [5]:
# Build a (residue x sequence) frequency matrix for the filtered positive set and sanity-check it
aa_list = list("ACDEFGHIKLMNPQRSTVWY")
num_aa = len(aa_list)
aa2idx = {aa: i for i, aa in enumerate(aa_list)}
aa_freq_matrix = np.zeros((num_aa, len(pos_seqs_final)))
for i, seq_str in enumerate(pos_seqs_final):
    if not seq_str:
        continue
    # Map residues to row indices; letters outside aa_list become -1 and are dropped below
    int_seq = np.array([aa2idx.get(aa, -1) for aa in seq_str], dtype=int)
    valid_indices = int_seq[int_seq != -1]
    total_len = len(valid_indices)
    if total_len == 0:
        continue
    counts = np.bincount(valid_indices, minlength=num_aa)
    # Column i holds the per-residue frequencies of sequence i
    aa_freq_matrix[:, i] = counts / total_len

# Diagnostics: shape, all-zero rows/columns, NaN/inf counts, and per-row / per-column value ranges
print("频率矩阵 shape:", aa_freq_matrix.shape)
print("全0行数：", np.sum(aa_freq_matrix.sum(axis=1) == 0))
print("全0列数：", np.sum(aa_freq_matrix.sum(axis=0) == 0))
print("nan数：", np.isnan(aa_freq_matrix).sum())
print("inf数：", np.isinf(aa_freq_matrix).sum())
print("每行最小值：", aa_freq_matrix.min(axis=1))
print("每行最大值：", aa_freq_matrix.max(axis=1))
print("每列最小值：", aa_freq_matrix.min(axis=0)[:10], "...")
print("每列最大值：", aa_freq_matrix.max(axis=0)[:10], "...")

In [6]:

import numpy as np
from collections import Counter


import numpy as np

from tqdm import tqdm

def get_aa_correlation_matrix_optimized(seqs, amino_acids="ACDEFGHIKLMNPQRSTVWY"):
    """Correlate per-sequence residue frequencies across sequences.

    Each sequence contributes one column of residue frequencies (counts of
    the letters in ``amino_acids`` divided by the number of such letters in
    the sequence). ``np.corrcoef`` is then applied with residues as the
    variables (rows) and sequences as the observations (columns).

    Sequences with no letter from ``amino_acids`` (including empty strings)
    are excluded from the matrix entirely. Leaving their columns as zeros
    would feed artificial all-zero observations into the correlation.

    Args:
        seqs: Sequence of residue strings.
        amino_acids: Ordered residue letters; defines the row/column order
            of the returned matrix.

    Returns:
        numpy.ndarray: ``(len(amino_acids), len(amino_acids))`` correlation
        matrix as produced by ``np.corrcoef``.
    """
    aa_list = list(amino_acids)
    num_aa = len(aa_list)
    aa2idx = {aa: i for i, aa in enumerate(aa_list)}
    aa_freq_matrix = np.zeros((num_aa, len(seqs)))
    # Marks the columns that were filled from a sequence with countable residues.
    has_data = np.zeros(len(seqs), dtype=bool)
    for i, seq_str in enumerate(tqdm(seqs, desc="统计频率")):
        valid_indices = [aa2idx[aa] for aa in seq_str if aa in aa2idx]
        if not valid_indices:
            continue
        counts = np.bincount(valid_indices, minlength=num_aa)
        aa_freq_matrix[:, i] = counts / len(valid_indices)
        has_data[i] = True
    # Only real observations take part in the correlation.
    aa_freq_matrix = aa_freq_matrix[:, has_data]
    print("频率矩阵 shape:", aa_freq_matrix.shape)
    print("准备调用 np.corrcoef ...")
    corr_matrix = np.corrcoef(aa_freq_matrix)
    print("np.corrcoef 完成")
    return corr_matrix

# --- Run the correlation analysis on the filtered sequence sets ---
print("计算正样本氨基酸相关性...")
pos_corr = get_aa_correlation_matrix_optimized(pos_seqs_final)
print("计算负样本氨基酸相关性...")
neg_corr = get_aa_correlation_matrix_optimized(neg_seqs_final)

# Report the sizes of the sets that were actually analysed (the filtered lists),
# not the raw pos_seqs / neg_seqs lists.
print(f"正样本序列数: {len(pos_seqs_final)}")
print(f"负样本序列数: {len(neg_seqs_final)}")
print(f"相关性矩阵形状: {pos_corr.shape}")
print("计算完成！")



In [7]:
# Print the NumPy version used by this kernel
import numpy
print(numpy.__version__)

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from matplotlib.patches import Rectangle

# Assumes pos_corr and neg_corr already exist (computed in an earlier cell).
# The labels come from an explicitly ordered list: by this point `amino_acids`
# has been rebound to a set by earlier cells, and a set has no defined order.
# The order below matches the default residue order of
# get_aa_correlation_matrix_optimized, so row/column k is labelled correctly.
aa_labels = list("ACDEFGHIKLMNPQRSTVWY")
pos_corr_df = pd.DataFrame(pos_corr, index=aa_labels, columns=aa_labels)
neg_corr_df = pd.DataFrame(neg_corr, index=aa_labels, columns=aa_labels)


def plot_corr_beauty(corr_df, title, save_path, highlight_boxes=None, vlim=0.4):
    """Draw a split correlation plot: circles above the diagonal, numbers below.

    Layout:
      * diagonal: the column label in bold red;
      * upper triangle: one circle per pair, area proportional to |r|
        (no circle when |r| <= 0.01), coloured with the coolwarm map over
        [-vlim, vlim];
      * lower triangle: the value printed with two decimals.

    Args:
        corr_df: Square DataFrame of correlations; its index/columns supply
            the tick labels.
        title: Figure title.
        save_path: Output path passed to ``savefig``.
        highlight_boxes: Optional iterable of ``(i0, j0, w, h)`` tuples; each
            draws a dashed red rectangle whose lower-left corner is at
            ``(j0 - 0.5, i0 - 0.5)`` with width ``w`` and height ``h``.
        vlim: Half-width of the colour scale (default 0.4, the previously
            hard-coded value). Used for both the circle colours and the
            colorbar so that the two always agree.

    Returns:
        None. The figure is saved, shown and then closed.
    """
    # Local import keeps the function usable even if the cell-level import was not run.
    from matplotlib.patches import Rectangle

    n = len(corr_df)
    fig, ax = plt.subplots(figsize=(10, 9))

    # Diagonal labels and upper-triangle circles.
    for i in range(n):
        ax.text(i, i, corr_df.columns[i], ha='center', va='center',
                fontsize=15, fontweight='bold', color='r')
        for j in range(i + 1, n):
            val = corr_df.iloc[i, j]
            color = plt.cm.coolwarm((val + vlim) / (2 * vlim))
            size = abs(val) * 800 if abs(val) > 0.01 else 0
            ax.scatter(j, i, s=size, color=color, alpha=0.8, edgecolor='k', linewidth=0.5)

    # Lower-triangle values. Strong values are drawn in black: these cells
    # have no coloured background, so white text would be invisible on the
    # default white axes.
    for i in range(1, n):
        for j in range(i):
            val = corr_df.iloc[i, j]
            ax.text(j, i, f"{val:.2f}", ha='center', va='center', fontsize=10,
                    color='black' if abs(val) > 0.2 else 'gray')

    # Optional dashed highlight rectangles.
    if highlight_boxes:
        for (i0, j0, w, h) in highlight_boxes:
            rect = Rectangle((j0 - 0.5, i0 - 0.5), w, h, linewidth=2,
                             edgecolor='red', facecolor='none', linestyle='--')
            ax.add_patch(rect)

    ax.set_xticks(range(n))
    ax.set_yticks(range(n))
    ax.set_xticklabels(corr_df.columns, fontsize=13)
    ax.set_yticklabels(corr_df.index, fontsize=13)
    ax.set_xlim(-0.5, n - 0.5)
    # Inverted y-limits put row 0 at the top, like a matrix.
    ax.set_ylim(n - 0.5, -0.5)
    ax.set_title(title, fontsize=16)

    sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=plt.Normalize(vmin=-vlim, vmax=vlim))
    cbar = fig.colorbar(sm, ax=ax, fraction=0.045, pad=0.03, orientation='horizontal')
    cbar.set_label('Correlation', fontsize=13)
    fig.tight_layout()
    fig.savefig(save_path)
    plt.show()
    plt.close(fig)


# Output directory for the figures (created if missing)
save_dir = '/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/'
os.makedirs(save_dir, exist_ok=True)
# Each tuple is (i0, j0, w, h) as consumed by plot_corr_beauty
highlight_boxes = [(13, 14, 2, 2), (16, 18, 3, 3)]  # adjust as needed

# One SVG per class
plot_corr_beauty(pos_corr_df, 'Amino Acid Correlation (Positive, Beauty)',
                 os.path.join(save_dir, 'positive_aa_correlation_beauty.svg'), highlight_boxes)
plot_corr_beauty(neg_corr_df, 'Amino Acid Correlation (Negative, Beauty)',
                 os.path.join(save_dir, 'negative_aa_correlation_beauty.svg'), highlight_boxes)
# Cell 1: import dependencies

In [2]:


import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd

# Cell 2: load data
attn_weights_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/attn_weights.npy"
gate_scores_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy"
topk_idx_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy"
token_types_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/test_token_types.npy"

# Shapes in the trailing comments are the author's notes; the prints below show the actual shapes
attn_weights = np.load(attn_weights_path)  # (4, 1149, 300, 300)
gate_scores = np.load(gate_scores_path)  # (4, 1149, 300, 30)
topk_idx = np.load(topk_idx_path)  # (4, 1149, 300, 3)
token_types = np.load(token_types_path)  # (1149, 300)

print("数据形状:")
print(f"attn_weights: {attn_weights.shape}")
print(f"gate_scores: {gate_scores.shape}")
print(f"topk_idx: {topk_idx.shape}")
print(f"token_types: {token_types.shape}")

# Cell 3: choose the sequence and layer to analyse
seq_idx = 0  # first sequence
layer_idx = 0  # first layer
# NOTE(review): hard-coded; a later cell derives the length from token_types != -1 — confirm 140 is right for this sequence
seq_len = 140  # actual sequence length (padding removed)

# Slice out the data of that sequence
seq_attn = attn_weights[layer_idx, seq_idx, :seq_len, :seq_len]  # (seq_len, seq_len)
seq_gate = gate_scores[layer_idx, seq_idx, :seq_len]  # (seq_len, 30)
seq_experts = topk_idx[layer_idx, seq_idx, :seq_len]  # (seq_len, 3)
seq_tokens = token_types[seq_idx, :seq_len]  # (seq_len,)

print(f"分析序列 {seq_idx}，层 {layer_idx}，长度 {seq_len}")


# Cell 4: 准备桑基图数据 - Token到Expert的分配
def prepare_sankey_data(seq_attn, seq_gate, seq_experts, seq_tokens, top_k=3):
    """Build node and link lists for a Token -> Expert -> Attention Sankey diagram.

    Node numbering: tokens occupy indices ``0 .. T-1``, experts occupy
    ``T .. T+E-1`` and a single "Attention" sink has index ``T+E``, where
    ``T = len(seq_tokens)`` and ``E = seq_gate.shape[1]``.

    Links:
      * token -> expert: one link for each of the first ``top_k`` experts of
        every non-padding token, weighted by that expert's gate score;
      * expert -> Attention: one link per expert that serves at least one
        non-padding token, weighted by the mean incoming attention
        (column sum of ``seq_attn``) of those tokens.

    Padding tokens (``seq_tokens == -1``) are ignored in both stages, and
    only the first ``top_k`` expert columns are considered in both stages.

    Args:
        seq_attn: 2-D array of attention weights, indexed ``[query, key]``
            by the column sum used here.
        seq_gate: 2-D array of gate scores, indexed ``[token, expert]``.
        seq_experts: 2-D integer array, indexed ``[token, k]``, of expert ids.
        seq_tokens: 1-D array of token types; ``-1`` marks padding.
        top_k: Number of expert columns to use per token.

    Returns:
        tuple: ``(token_nodes, expert_nodes, source, target, value, color)``.
        The first two are label lists; the rest are parallel link lists.
        Colours are ``rgba(...)`` strings whose alpha is clamped to [0, 1]
        and written in fixed-point form.
    """
    num_tokens = len(seq_tokens)
    # Derive the expert count from the gate matrix instead of assuming 30.
    num_experts = seq_gate.shape[1]
    attention_node = num_tokens + num_experts

    token_nodes = [f"Token_{i}" for i in range(num_tokens)]
    expert_nodes = [f"Expert_{eid}" for eid in range(num_experts)]

    source = []
    target = []
    value = []
    color = []

    def _alpha(raw):
        # An rgba alpha must lie in [0, 1]. Summed attention can exceed 1, and
        # fixed-point formatting avoids exponent notation for tiny values.
        return f"{min(max(float(raw), 0.0), 1.0):.3f}"

    real_tokens = [t for t in range(num_tokens) if seq_tokens[t] != -1]

    # Token -> Expert links
    for token_idx in real_tokens:
        for k in range(top_k):
            expert_id = int(seq_experts[token_idx, k])
            gate_score = float(seq_gate[token_idx, expert_id])

            source.append(token_idx)
            target.append(num_tokens + expert_id)
            value.append(gate_score)
            color.append(f"rgba(100, 149, 237, {_alpha(gate_score)})")

    # Expert -> Attention links
    attention_in = seq_attn.sum(axis=0)  # how much each token is attended to

    for expert_id in range(num_experts):
        expert_tokens = [t for t in real_tokens if expert_id in seq_experts[t, :top_k]]
        if not expert_tokens:
            continue

        avg_attention = float(np.mean([attention_in[t] for t in expert_tokens]))

        source.append(num_tokens + expert_id)
        target.append(attention_node)
        value.append(avg_attention)
        color.append(f"rgba(255, 99, 71, {_alpha(avg_attention)})")

    return token_nodes, expert_nodes, source, target, value, color


# Build the node and link lists for the sequence/layer selected above
token_nodes, expert_nodes, source, target, value, color = prepare_sankey_data(
    seq_attn, seq_gate, seq_experts, seq_tokens
)


# Cell 5: 创建桑基图
def create_sankey_diagram(token_nodes, expert_nodes, source, target, value, color):
    """Assemble a plotly Sankey figure from node labels and link lists.

    The node list is the token labels, then the expert labels, then one
    final "Attention" node; link ``source``/``target`` indices refer to
    positions in that combined list. The title reads the module-level
    ``seq_idx`` and ``layer_idx``.

    Returns:
        The plotly figure (not shown or saved here).
    """
    labels = token_nodes + expert_nodes + ["Attention"]
    palette = (
        ["lightblue"] * len(token_nodes)
        + ["lightgreen"] * len(expert_nodes)
        + ["lightcoral"]
    )

    node_spec = dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color=palette,
    )
    link_spec = dict(source=source, target=target, value=value, color=color)

    figure = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
    figure.update_layout(
        title_text=f"Token-Expert-Attention Flow (Sequence {seq_idx}, Layer {layer_idx})",
        font_size=10,
        height=800,
    )
    return figure


fig = create_sankey_diagram(token_nodes, expert_nodes, source, target, value, color)
fig.show()
# Cell 6: save the Sankey diagram
import os  # local import: the import block of this cell does not bring in os

save_path = f"/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/sankey_seq{seq_idx}_layer{layer_idx}.html"
# Create the output directory first, as the other saving cells do, so the
# write does not depend on an earlier cell having created it.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
fig.write_html(save_path)
print(f"桑基图已保存到: {save_path}")
# Cell 1: 导入依赖
import numpy as np
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt


# Cell 2: 读取 FASTA 格式的氨基酸序列
def read_fasta(filepath):
    """Parse a FASTA file into a list of sequence strings.

    A line beginning with '>' starts a new record and is not kept. All other
    lines are stripped and concatenated into the current record. Records
    that end up empty are omitted from the result.

    Args:
        filepath: Path of the FASTA file.

    Returns:
        list[str]: Sequences in file order.
    """
    sequences = []
    current = ''

    def _flush(buffer):
        # Store the buffered record only when it holds at least one character.
        if buffer:
            sequences.append(buffer)

    with open(filepath, 'r') as fasta_file:
        for text in fasta_file:
            if text.startswith('>'):
                _flush(current)
                current = ''
            else:
                current = current + text.strip()
        _flush(current)
    return sequences


# Read the positive and negative sample sequences
pos_train = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/positive_train.fasta')
pos_test = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/positive_test.fasta')
neg_train = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/negative_train.fasta')
neg_test = read_fasta('/exp_data/sjx/star/first_data/shisuandanbai/negative_test.fasta')

# Merge train and test per class
pos_seqs = pos_train + pos_test
neg_seqs = neg_train + neg_test

print(f"正样本序列数: {len(pos_seqs)}")
print(f"负样本序列数: {len(neg_seqs)}")
print(f"第一条正样本序列: {pos_seqs[0][:50]}...")
# Cell 3: fix the amino-acid order (used for matrix rows and plot labels below)
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
num_aa = len(amino_acids)
print("氨基酸列表:", amino_acids)


# Cell 4: 计算正样本氨基酸相关性矩阵
def get_aa_correlation_matrix(seqs, alphabet="ACDEFGHIKLMNPQRSTVWY"):
    """Compute the correlation matrix between residue frequencies.

    Each sequence yields one observation: the frequency of every letter of
    ``alphabet`` among the sequence's characters that belong to ``alphabet``.
    ``np.corrcoef`` is applied with residues as variables and sequences as
    observations.

    A sequence without any letter from ``alphabet`` triggers a warning and
    is left out of the matrix. Keeping it as an all-zero column would add an
    artificial observation and distort every coefficient.

    Args:
        seqs: List of residue strings.
        alphabet: Ordered residue letters defining the row/column order.
            Defaults to the same 20 letters this notebook assigns to
            ``amino_acids``.

    Returns:
        numpy.ndarray: ``(len(alphabet), len(alphabet))`` correlation matrix.
    """
    residues = list(alphabet)
    known = set(residues)
    columns = []

    for i, seq in enumerate(seqs):
        # Count real residues only; any other character is ignored.
        c = Counter(aa for aa in seq if aa in known)
        total_len = sum(c.values())
        if total_len == 0:
            print(f"Warning: sequence {i} has no valid amino acids!")
            continue
        columns.append([c[aa] / total_len for aa in residues])

    # Rows are residues and columns are the retained sequences. The reshape
    # keeps the (num_residues, 0) shape when no sequence was retained.
    aa_freq_matrix = np.array(columns, dtype=float).reshape(-1, len(residues)).T

    corr_matrix = np.corrcoef(aa_freq_matrix)
    return corr_matrix


# Correlation matrix of the positive samples
print("计算正样本氨基酸相关性...")
pos_corr = get_aa_correlation_matrix(pos_seqs)
# Cell 5: correlation matrix of the negative samples
print("计算负样本氨基酸相关性...")
neg_corr = get_aa_correlation_matrix(neg_seqs)
# Cell 6: inspect the value range of both matrices
print("正样本相关性矩阵最大值：", np.max(pos_corr))
print("正样本相关性矩阵最小值：", np.min(pos_corr))
print("负样本相关性矩阵最大值：", np.max(neg_corr))
print("负样本相关性矩阵最小值：", np.min(neg_corr))
# Cell 7: wrap as DataFrames labelled with the residue letters
pos_corr_df = pd.DataFrame(pos_corr, index=amino_acids, columns=amino_acids)
neg_corr_df = pd.DataFrame(neg_corr, index=amino_acids, columns=amino_acids)


# Cell 8: 画图函数
def plot_corr_beauty(corr_df, title, save_path, highlight_boxes=None, vlim=0.4):
    """Draw a split correlation plot: circles above the diagonal, numbers below.

    Layout:
      * diagonal: the column label in bold red;
      * upper triangle: one circle per pair, area proportional to |r|
        (no circle when |r| <= 0.01), coloured with the coolwarm map over
        [-vlim, vlim];
      * lower triangle: the value printed with two decimals.

    Args:
        corr_df: Square DataFrame of correlations; its index/columns supply
            the tick labels.
        title: Figure title.
        save_path: Output path passed to ``savefig``.
        highlight_boxes: Optional iterable of ``(i0, j0, w, h)`` tuples; each
            draws a dashed red rectangle whose lower-left corner is at
            ``(j0 - 0.5, i0 - 0.5)`` with width ``w`` and height ``h``.
        vlim: Half-width of the colour scale (default 0.4, the previously
            hard-coded value). Used for both the circle colours and the
            colorbar so that the two always agree.

    Returns:
        None. The figure is saved, shown and then closed.
    """
    # Rectangle is not imported by this cell's import block; import it here so
    # that passing highlight_boxes does not raise NameError on a fresh kernel.
    from matplotlib.patches import Rectangle

    n = len(corr_df)
    fig, ax = plt.subplots(figsize=(10, 9))

    # Diagonal labels and upper-triangle circles.
    for i in range(n):
        ax.text(i, i, corr_df.columns[i], ha='center', va='center',
                fontsize=15, fontweight='bold', color='r')
        for j in range(i + 1, n):
            val = corr_df.iloc[i, j]
            color = plt.cm.coolwarm((val + vlim) / (2 * vlim))
            size = abs(val) * 800 if abs(val) > 0.01 else 0
            ax.scatter(j, i, s=size, color=color, alpha=0.8, edgecolor='k', linewidth=0.5)

    # Lower-triangle values. Strong values are drawn in black: these cells
    # have no coloured background, so white text would be invisible on the
    # default white axes.
    for i in range(1, n):
        for j in range(i):
            val = corr_df.iloc[i, j]
            ax.text(j, i, f"{val:.2f}", ha='center', va='center', fontsize=10,
                    color='black' if abs(val) > 0.2 else 'gray')

    # Optional dashed highlight rectangles.
    if highlight_boxes:
        for (i0, j0, w, h) in highlight_boxes:
            rect = Rectangle((j0 - 0.5, i0 - 0.5), w, h, linewidth=2,
                             edgecolor='red', facecolor='none', linestyle='--')
            ax.add_patch(rect)

    ax.set_xticks(range(n))
    ax.set_yticks(range(n))
    ax.set_xticklabels(corr_df.columns, fontsize=13)
    ax.set_yticklabels(corr_df.index, fontsize=13)
    ax.set_xlim(-0.5, n - 0.5)
    # Inverted y-limits put row 0 at the top, like a matrix.
    ax.set_ylim(n - 0.5, -0.5)
    ax.set_title(title, fontsize=16)

    sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=plt.Normalize(vmin=-vlim, vmax=vlim))
    cbar = fig.colorbar(sm, ax=ax, fraction=0.045, pad=0.03, orientation='horizontal')
    cbar.set_label('Correlation', fontsize=13)
    fig.tight_layout()
    fig.savefig(save_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [3]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import os

# Load data
attn_weights_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/attn_weights.npy"
gate_scores_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy"
topk_idx_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy"
token_types_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/test_token_types.npy"

# Shapes in the trailing comments are the author's notes, not checked here
attn_weights = np.load(attn_weights_path)  # (4, 1149, 300, 300)
gate_scores = np.load(gate_scores_path)    # (4, 1149, 300, 30)
topk_idx = np.load(topk_idx_path)          # (4, 1149, 300, 3)
token_types = np.load(token_types_path)    # (1149, 300)

# Layer and sequence to analyse
layer_idx = 3     # last layer (per the author, usually the most important)
seq_idx = 42      # a representative sequence
max_seq_len = 100 # cap the length so the diagram stays readable
k_value = 3       # number of experts used per token

# Effective length: number of non-padding tokens (token type != -1), capped at max_seq_len
seq_len = min((token_types[seq_idx] != -1).sum(), max_seq_len)

# Slice out the data of that sequence
seq_attn = attn_weights[layer_idx, seq_idx, :seq_len, :seq_len]
seq_gate = gate_scores[layer_idx, seq_idx, :seq_len]
seq_experts = topk_idx[layer_idx, seq_idx, :seq_len]

# Expert usage: sum of gate scores each expert receives over the selected tokens
experts_usage = np.zeros(30)
for token_idx in range(seq_len):
    for k in range(k_value):
        expert_id = seq_experts[token_idx, k]
        gate_val = seq_gate[token_idx, expert_id]
        experts_usage[expert_id] += gate_val

# The 10 experts with the highest usage, in descending order
top_experts_idx = np.argsort(experts_usage)[-10:][::-1]
top_experts_usage = experts_usage[top_experts_idx]

# 准备桑基图数据
def prepare_advanced_sankey_data():
    """Build a position -> expert -> region Sankey description.

    Reads the module-level ``seq_len``, ``k_value``, ``seq_experts``,
    ``seq_gate``, ``seq_attn`` and ``top_experts_idx``.

    Node numbering: positions ``0 .. seq_len-1``, then the experts of
    ``top_experts_idx`` (in that order), then ``n_regions`` region nodes.

    Links:
      * position -> expert for each of the position's first ``k_value``
        experts that is in ``top_experts_idx`` and has gate score > 0.1
        (value = gate * 100);
      * expert -> region: for every position that uses the expert, its
        attention summed over each region is accumulated; the per-expert
        totals are normalised to sum to 1 and shares > 0.05 become links
        (value = share * 200).

    The sequence is cut into ``n_regions`` contiguous regions of
    ``seq_len // n_regions`` positions. The last region is extended to
    ``seq_len`` so that leftover positions are not dropped when the length
    is not a multiple of ``n_regions``.

    Returns:
        tuple: ``(node_labels, node_colors, sources, targets, values, colors)``.
    """
    n_regions = 5
    region_size = seq_len // n_regions

    def _region_bounds(r_idx):
        # Half-open [start, end) slice of region r_idx; the last region
        # absorbs the remainder positions.
        start = r_idx * region_size
        end = seq_len if r_idx == n_regions - 1 else (r_idx + 1) * region_size
        return start, end

    # Expert id -> position in the expert column of the diagram.
    expert_slot = {int(expert_id): slot for slot, expert_id in enumerate(top_experts_idx)}
    n_top = len(top_experts_idx)

    sources = []
    targets = []
    values = []
    colors = []

    # Position -> expert links
    for pos in range(seq_len):
        for k in range(k_value):
            expert_id = int(seq_experts[pos, k])
            if expert_id not in expert_slot:
                continue
            gate_value = seq_gate[pos, expert_id]
            # Keep meaningful connections only
            if gate_value > 0.1:
                sources.append(pos)
                targets.append(seq_len + expert_slot[expert_id])
                values.append(gate_value * 100)  # scaled up for readability
                colors.append(f"rgba(86, 180, 233, {min(gate_value, 0.9)})")

    # Expert -> region links
    for e_idx, expert_id in enumerate(top_experts_idx):
        expert_attention = np.zeros(n_regions)

        for pos in range(seq_len):
            if expert_id in seq_experts[pos]:
                for r_idx in range(n_regions):
                    start, end = _region_bounds(r_idx)
                    expert_attention[r_idx] += seq_attn[pos, start:end].sum()

        # Normalise to shares; skip experts with no attention mass at all
        if expert_attention.sum() > 0:
            expert_attention = expert_attention / expert_attention.sum()

            for r_idx, attn in enumerate(expert_attention):
                if attn > 0.05:  # drop small shares
                    sources.append(seq_len + e_idx)
                    targets.append(seq_len + n_top + r_idx)
                    values.append(attn * 200)  # scaled up for readability
                    colors.append(f"rgba(230, 159, 0, {min(attn * 2, 0.9)})")

    # Node colours and labels, in node-index order
    node_colors = []
    node_labels = []

    for i in range(seq_len):
        node_colors.append("rgba(86, 180, 233, 0.8)")
        # Label every 10th position only, to avoid clutter
        node_labels.append(f"Pos {i+1}" if i % 10 == 0 else "")

    for expert_id in top_experts_idx:
        node_colors.append("rgba(0, 158, 115, 0.8)")
        node_labels.append(f"Expert {expert_id}")

    for i in range(n_regions):
        node_colors.append("rgba(240, 228, 66, 0.8)")
        start, end = _region_bounds(i)
        # The label is 1-based and inclusive
        node_labels.append(f"Region {i+1}\n({start + 1}-{end})")

    return node_labels, node_colors, sources, targets, values, colors

# Build the Sankey data
node_labels, node_colors, sources, targets, values, colors = prepare_advanced_sankey_data()

# Create the Sankey figure
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        color=node_colors
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color=colors
    )
)])

# Title and layout (the layer is shown 1-based)
fig.update_layout(
    title_text=f"MoE Expert Flow Analysis: Layer {layer_idx+1}, Sequence {seq_idx}",
    font_size=12,
    height=800,
    width=1000
)

# Save the figure as HTML
save_dir = "/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, f"advanced_sankey_layer{layer_idx+1}_seq{seq_idx}.html")
fig.write_html(save_path)

# Bar chart of the usage score of the top experts
plt.figure(figsize=(12, 6))
bars = plt.bar(range(len(top_experts_idx)), top_experts_usage, color='skyblue')

# Annotate each bar with its expert id
for i, bar in enumerate(bars):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"Expert {top_experts_idx[i]}", ha='center', rotation=45)

plt.xlabel("Top Experts", fontsize=14)
plt.ylabel("Usage Score", fontsize=14)
plt.title(f"Top-10 Expert Usage (Layer {layer_idx+1}, Sequence {seq_idx})", fontsize=16)
plt.tight_layout()

# Save the bar chart
expert_usage_path = os.path.join(save_dir, f"expert_usage_layer{layer_idx+1}_seq{seq_idx}.svg")
plt.savefig(expert_usage_path, format="svg", bbox_inches="tight")

print(f"桑基图已保存到: {save_path}")
print(f"专家使用情况图已保存到: {expert_usage_path}")

In [5]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import os

# Load data
attn_weights_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/attn_weights.npy"
gate_scores_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy"
topk_idx_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy"
token_types_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/test_token_types.npy"

# Shapes in the trailing comments are the author's notes, not checked here
attn_weights = np.load(attn_weights_path)  # (4, 1149, 300, 300)
gate_scores = np.load(gate_scores_path)    # (4, 1149, 300, 30)
topk_idx = np.load(topk_idx_path)          # (4, 1149, 300, 3)
token_types = np.load(token_types_path)    # (1149, 300)

# Layer and sequence to analyse - uses the first sequence
layer_idx = 3     # last layer
seq_idx = 0       # first sequence
# NOTE(review): hard-coded rather than derived from token_types — confirm it matches sequence 0
seq_len = 140     # length of the first sequence
k_value = 3       # number of experts used per token

print(f"分析序列 {seq_idx} (层 {layer_idx+1})，长度 {seq_len}")

# Slice out the data of that sequence
seq_attn = attn_weights[layer_idx, seq_idx, :seq_len, :seq_len]
seq_gate = gate_scores[layer_idx, seq_idx, :seq_len]
seq_experts = topk_idx[layer_idx, seq_idx, :seq_len]

# Expert usage: sum of gate scores each expert receives over the selected tokens
experts_usage = np.zeros(30)
for token_idx in range(seq_len):
    for k in range(k_value):
        expert_id = seq_experts[token_idx, k]
        gate_val = seq_gate[token_idx, expert_id]
        experts_usage[expert_id] += gate_val

# The 8 experts with the highest usage, in descending order
top_experts_idx = np.argsort(experts_usage)[-8:][::-1]
top_experts_usage = experts_usage[top_experts_idx]
print(f"使用率最高的8个专家: {top_experts_idx}")

# 准备桑基图数据 - 修正版
def prepare_enhanced_sankey_data():
    """Build a position-group -> expert -> region Sankey description.

    Reads the module-level ``seq_len``, ``k_value``, ``seq_experts``,
    ``seq_gate``, ``seq_attn`` and ``top_experts_idx``.

    Node numbering: ``n_pos_groups`` position groups, then the experts of
    ``top_experts_idx`` (in that order), then ``n_regions`` region nodes.

    Weights:
      * group -> expert: sum of gate scores of the group's positions for
        that expert (only the first ``k_value`` experts per position, and
        only experts in ``top_experts_idx``);
      * expert -> region: sum over those same (position, expert) pairs of
        ``gate * attention mass the position puts on the region``,
        normalised per expert to sum to 1.

    Links with weight <= 0.05 are dropped.

    Positions are split into equally sized groups/regions. The last
    group/region takes the leftover positions when ``seq_len`` is not a
    multiple of the group count, and the labels report the range actually
    covered.

    Returns:
        tuple: ``(all_nodes, all_node_colors, sources, targets, values, colors)``.
    """
    n_pos_groups = 8
    n_regions = 4  # four regions are easier to read
    n_top = len(top_experts_idx)

    # max(1, ...) avoids division by zero for sequences shorter than the group count.
    pos_group_size = max(1, seq_len // n_pos_groups)
    region_size = max(1, seq_len // n_regions)

    def _bounds(idx, size, count):
        # Half-open [start, end) range of chunk idx; the last chunk absorbs the remainder.
        start = idx * size
        end = seq_len if idx == count - 1 else min((idx + 1) * size, seq_len)
        return start, end

    group_bounds = [_bounds(g, pos_group_size, n_pos_groups) for g in range(n_pos_groups)]
    region_bounds = [_bounds(r, region_size, n_regions) for r in range(n_regions)]

    # Labels are 1-based and inclusive, and match the ranges used below.
    pos_group_nodes = [f"Pos {start + 1}-{end}" for start, end in group_bounds]
    expert_nodes = [f"Expert {i}" for i in top_experts_idx]
    region_nodes = [f"Region {r + 1}\n({start + 1}-{end})"
                    for r, (start, end) in enumerate(region_bounds)]

    pos_to_expert_weight = np.zeros((n_pos_groups, n_top))
    expert_to_region_weight = np.zeros((n_top, n_regions))

    # Expert id -> column in the weight matrices.
    expert_slot = {int(expert_id): slot for slot, expert_id in enumerate(top_experts_idx)}

    for pos in range(seq_len):
        pos_group = min(pos // pos_group_size, n_pos_groups - 1)
        attn_dist = seq_attn[pos]

        for k in range(k_value):
            expert_id = int(seq_experts[pos, k])
            if expert_id not in expert_slot:
                continue
            expert_idx = expert_slot[expert_id]
            gate_value = seq_gate[pos, expert_id]

            # Group -> expert: accumulate the gate score
            pos_to_expert_weight[pos_group, expert_idx] += gate_value

            # Expert -> region: gate-weighted attention mass per region
            for r_idx, (start, end) in enumerate(region_bounds):
                expert_to_region_weight[expert_idx, r_idx] += attn_dist[start:end].sum() * gate_value

    # Normalise each expert's region weights to shares
    for expert_idx in range(n_top):
        row_total = expert_to_region_weight[expert_idx].sum()
        if row_total > 0:
            expert_to_region_weight[expert_idx] = expert_to_region_weight[expert_idx] / row_total

    sources = []
    targets = []
    values = []
    colors = []

    # Group -> expert links
    for pos_group in range(n_pos_groups):
        for expert_idx in range(n_top):
            weight = pos_to_expert_weight[pos_group, expert_idx]
            if weight > 0.05:  # drop tiny links
                sources.append(pos_group)
                targets.append(n_pos_groups + expert_idx)
                values.append(weight * 15)  # scaled up for readability
                colors.append(f"rgba(86, 180, 233, {min(weight*1.5, 0.9)})")

    # Expert -> region links
    for expert_idx in range(n_top):
        for r_idx in range(n_regions):
            weight = expert_to_region_weight[expert_idx, r_idx]
            if weight > 0.05:  # drop tiny links
                sources.append(n_pos_groups + expert_idx)
                targets.append(n_pos_groups + n_top + r_idx)
                values.append(weight * 20)  # scaled up
                colors.append(f"rgba(230, 159, 0, {min(weight*1.5, 0.9)})")

    pos_colors = ["rgba(86, 180, 233, 0.9)"] * n_pos_groups  # blue
    expert_colors = ["rgba(0, 158, 115, 0.9)"] * n_top  # green
    region_colors = ["rgba(240, 228, 66, 0.9)"] * n_regions  # yellow

    all_node_colors = pos_colors + expert_colors + region_colors
    all_nodes = pos_group_nodes + expert_nodes + region_nodes

    return all_nodes, all_node_colors, sources, targets, values, colors

# Build the Sankey data
nodes, node_colors, sources, targets, values, colors = prepare_enhanced_sankey_data()

# Create the Sankey figure
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        color=node_colors
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color=colors
    )
)])

# Title and layout (the layer is shown 1-based; transparent backgrounds)
fig.update_layout(
    title_text=f"MoE三级流动分析: 位置组 → 专家 → 区域<br>层 {layer_idx+1}, 序列 {seq_idx} (长度 {seq_len})",
    font=dict(size=14, family="Arial"),
    height=800,
    width=1000,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)

# Save the figure as HTML
save_dir = "/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, f"three_level_sankey_seq{seq_idx}_layer{layer_idx+1}.html")
fig.write_html(save_path)

print(f"三级桑基图已保存到: {save_path}")

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
from sklearn.decomposition import PCA
from matplotlib.colors import LinearSegmentedColormap

# Create the output directory
save_dir = "/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/info_flow"
os.makedirs(save_dir, exist_ok=True)

# Load data
attn_weights_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/attn_weights.npy"
gate_scores_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy"
topk_idx_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy"
token_types_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/test_token_types.npy"

print("加载数据中...")
# Shapes in the trailing comments are the author's notes; the dimensions used below are read from the arrays
attn_weights = np.load(attn_weights_path)  # (4, 1149, 300, 300)
gate_scores = np.load(gate_scores_path)    # (4, 1149, 300, 30)
topk_idx = np.load(topk_idx_path)          # (4, 1149, 300, 3)
token_types = np.load(token_types_path)    # (1149, 300)

# Dimensions from the array shapes; note that seq_len here is the first three-plus-one unpacked
# axes of attn_weights, i.e. its third axis, not the length of one particular sequence
num_layers, num_seqs, seq_len, _ = attn_weights.shape
num_experts = gate_scores.shape[-1]
topk = topk_idx.shape[-1]

print(f"数据加载完成，形状: layers={num_layers}, seqs={num_seqs}, len={seq_len}, experts={num_experts}")

# 1. 计算每个专家的注意力模式
def analyze_expert_attention_patterns(layer_idx=3):
    """Accumulate a gate-weighted attention pattern for every expert.

    Reads the module-level ``attn_weights``, ``gate_scores``, ``topk_idx``,
    ``token_types``, ``num_seqs``, ``seq_len``, ``num_experts`` and ``topk``.

    For each sequence, the valid length is the number of tokens whose type
    is not -1 (capped at ``seq_len``). For each valid position and each of
    its ``topk`` experts, the position's attention row (over the valid
    positions) is multiplied by the expert's gate score and added to that
    expert's pattern. The pattern is finally divided by the expert's total
    gate score.

    NOTE(review): every contribution is broadcast over the first axis of the
    pattern, so within the valid block all rows receive the same vector and
    the first axis carries no per-query information. Confirm that this is
    the intended definition.

    The contributions of one sequence are first summed into a per-expert row
    vector and broadcast into the pattern once, instead of materialising a
    full outer-product matrix for every (position, expert) pair. The result
    is the same up to floating-point summation order.

    Args:
        layer_idx: Index of the layer to analyse.

    Returns:
        numpy.ndarray: Array of shape ``(num_experts, seq_len, seq_len)``;
        all zeros for experts that never received a positive gate total.
    """
    print(f"分析层 {layer_idx+1} 的专家注意力模式...")

    expert_attention_sum = np.zeros((num_experts, seq_len, seq_len))
    expert_token_count = np.zeros(num_experts)

    for seq_idx in range(num_seqs):
        # Valid (non-padding) length of this sequence
        valid_len = min(seq_len, (token_types[seq_idx] != -1).sum())
        if valid_len == 0:
            continue

        # float64 keeps the accumulation precision independent of the stored dtype
        seq_attn = attn_weights[layer_idx, seq_idx, :valid_len, :valid_len].astype(np.float64)
        seq_experts = topk_idx[layer_idx, seq_idx, :valid_len]
        seq_gate = gate_scores[layer_idx, seq_idx, :valid_len]

        # Gate-weighted sum of attention rows, per expert, for this sequence
        weighted_rows = np.zeros((num_experts, valid_len))
        for pos in range(valid_len):
            for k in range(topk):
                expert_id = seq_experts[pos, k]
                gate_value = seq_gate[pos, expert_id]
                weighted_rows[expert_id] += gate_value * seq_attn[pos]
                expert_token_count[expert_id] += gate_value

        # Broadcast each expert's vector over the valid rows once per sequence
        expert_attention_sum[:, :valid_len, :valid_len] += weighted_rows[:, np.newaxis, :]

    # Normalise by the total gate score of each expert
    expert_attention_patterns = np.zeros_like(expert_attention_sum)
    for eid in range(num_experts):
        if expert_token_count[eid] > 0:
            expert_attention_patterns[eid] = expert_attention_sum[eid] / expert_token_count[eid]

    return expert_attention_patterns

# 2. 可视化专家注意力模式
def visualize_expert_attention_patterns(expert_patterns, layer_idx=3, top_n=10):
    """Plot the attention heat maps of the ``top_n`` strongest experts.

    Experts are ranked by the mean of their pattern matrix.  The top-left
    100 x 100 corner of each selected pattern is drawn in its own panel and
    the figure is written as a PDF into the module-level ``save_dir``.

    Args:
        expert_patterns: Array indexed as ``[expert, row, column]``, as
            returned by ``analyze_expert_attention_patterns``.
        layer_idx: Zero-based layer index; used for the title and file name.
        top_n: Number of experts to show.

    Returns:
        None.
    """
    print("可视化专家注意力模式...")

    # Rank experts by mean attention and keep the strongest ones.
    expert_attn_strength = np.array([pattern.mean() for pattern in expert_patterns])
    top_experts = np.argsort(expert_attn_strength)[-top_n:]

    cmap = LinearSegmentedColormap.from_list(
        'custom_cmap',
        [(0, 'white'), (0.3, '#ffffcc'), (0.6, '#a1dab4'), (0.8, '#41b6c4'), (1, '#225ea8')],
        N=256
    )

    n_cols = 5
    n_rows = (top_n + n_cols - 1) // n_cols
    # squeeze=False always yields a 2-D array, so flattening is uniform for
    # every grid size (the previous special case broke for top_n == 1).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 3.5*n_rows), squeeze=False)
    axes = axes.ravel()

    im = None
    for i, eid in enumerate(top_experts):
        if i >= len(axes):
            break
        ax = axes[i]

        attn_pattern = expert_patterns[eid, :100, :100]

        im = ax.imshow(attn_pattern, aspect='equal',
                       interpolation='none', cmap=cmap, vmin=0, vmax=attn_pattern.max())

        strength = expert_attn_strength[eid]
        ax.set_title(f"Expert {eid}\nStrength: {strength:.3f}", fontsize=11)

        ax.set_xticks([])
        ax.set_yticks([])

        # Columns of the pattern are the attended (key) positions, rows the
        # attending (query) positions, so label the axes accordingly.
        if i >= len(top_experts) - n_cols:  # last panels
            ax.set_xlabel("Key Position", fontsize=9)
        if i % n_cols == 0:  # first column
            ax.set_ylabel("Query Position", fontsize=9)

    # Hide every panel that did not receive an expert.
    for ax in axes[len(top_experts):]:
        ax.axis('off')

    fig.suptitle(f"Layer {layer_idx+1} Top {top_n} Expert Attention Patterns",
                 fontsize=16, fontweight='bold', y=0.98)
    # Lay out the grid before adding the manually placed colour-bar axes,
    # which tight_layout cannot handle.
    fig.tight_layout()
    fig.subplots_adjust(top=0.9, right=0.9)

    if im is not None:
        # NOTE(review): each panel is scaled to its own maximum, so this
        # colour bar reflects the scale of the last panel only.
        cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
        cbar = fig.colorbar(im, cax=cbar_ax)
        cbar.set_label("Attention Weight", fontsize=10)

    fig_path = os.path.join(save_dir, f"expert_attention_patterns_layer{layer_idx+1}_top{top_n}.pdf")
    fig.savefig(fig_path, format='pdf', bbox_inches='tight')
    print(f"图像已保存至: {fig_path}")
    plt.close(fig)

# 3. 分析专家信息流向
def analyze_expert_information_flow(layer_idx=3, n_clusters=5):
    """Summarise, per expert, which sequence regions its tokens attend to.

    The position axis ``[0, seq_len)`` is cut into ``n_clusters`` contiguous
    regions.  For every non-padding token the attention it spends on each
    region is normalised to sum to one and added, scaled by the gate score,
    to the ``[expert, region of the token, :]`` row of every expert the token
    is routed to.  Each non-empty row is finally normalised to sum to one.

    Reads the module-level arrays ``attn_weights``, ``gate_scores``,
    ``topk_idx`` and ``token_types`` and the sizes ``num_experts``,
    ``num_seqs``, ``seq_len`` and ``topk``.

    Args:
        layer_idx: Zero-based index of the layer to analyse.
        n_clusters: Number of equal-width regions.

    Returns:
        Tuple ``(expert_region_flow, regions)``: an array of shape
        ``(num_experts, n_clusters, n_clusters)`` and the list of
        ``(start, end)`` region bounds (end exclusive).
    """
    print(f"分析层 {layer_idx+1} 的信息流向...")

    # Region bounds over the full (padded) sequence length.
    boundaries = np.linspace(0, seq_len, n_clusters+1, dtype=int)
    regions = list(zip(boundaries[:-1], boundaries[1:]))

    # Region index of every absolute position; a sequence of length
    # valid_len simply uses the first valid_len entries.
    region_of_position = np.zeros(seq_len, dtype=int)
    for r_idx, (start, end) in enumerate(regions):
        region_of_position[start:end] = r_idx

    expert_region_flow = np.zeros((num_experts, n_clusters, n_clusters))
    expert_token_count = np.zeros(num_experts)

    for seq_idx in range(num_seqs):
        # Token type -1 marks padding.
        valid_len = min(seq_len, (token_types[seq_idx] != -1).sum())
        if valid_len == 0:
            continue

        seq_attn = attn_weights[layer_idx, seq_idx, :valid_len, :valid_len]
        seq_experts = topk_idx[layer_idx, seq_idx, :valid_len]
        seq_gate = gate_scores[layer_idx, seq_idx, :valid_len]
        pos_to_region = region_of_position[:valid_len]

        for pos in range(valid_len):
            attn_dist = seq_attn[pos, :valid_len]

            # Attention mass this token spends on each region.
            region_attn = np.zeros(n_clusters)
            for r_idx, (start, end) in enumerate(regions):
                stop = min(end, valid_len)
                if start < stop:
                    region_attn[r_idx] = attn_dist[start:stop].sum()
            region_attn = region_attn / (region_attn.sum() + 1e-10)

            source_region = pos_to_region[pos]
            for k in range(topk):
                expert_id = seq_experts[pos, k]
                gate_value = seq_gate[pos, expert_id]
                expert_region_flow[expert_id, source_region, :] += gate_value * region_attn
                expert_token_count[expert_id] += gate_value

    # Turn every non-empty source-region row into a distribution.
    for eid in np.flatnonzero(expert_token_count > 0):
        for row in expert_region_flow[eid]:  # `row` is a view, updated in place
            total = row.sum()
            if total > 0:
                row /= total

    return expert_region_flow, regions

# 4. 可视化专家信息流向
def visualize_expert_information_flow(expert_flow, regions, layer_idx=3, top_n=8):
    """Plot the region-to-region flow matrices of the ``top_n`` experts.

    Each selected expert gets an annotated heat map of its flow matrix.  All
    panels share one 0-1 colour scale that matches the shared colour bar, and
    the figure is written as a PDF into the module-level ``save_dir``.

    Args:
        expert_flow: Array indexed as ``[expert, from_region, to_region]``.
        regions: List of ``(start, end)`` bounds (end exclusive), used for
            the tick labels.
        layer_idx: Zero-based layer index; used for the title and file name.
        top_n: Number of experts to show.

    Returns:
        None.
    """
    print("可视化专家信息流向...")

    # NOTE(review): when expert_flow comes from
    # analyze_expert_information_flow its rows already sum to 1, so this sum
    # only counts the non-empty source regions and the ranking is mostly
    # tie-breaking -- confirm this is the intended selection criterion.
    expert_flow_strength = np.array([flow.sum() for flow in expert_flow])
    top_experts = np.argsort(expert_flow_strength)[-top_n:]

    n_cols = min(4, top_n)
    n_rows = (top_n + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array, so one flatten handles all grids.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3.5*n_rows), squeeze=False)
    axes = axes.ravel()

    region_labels = [f"Region {i+1}\n({start}-{end-1})" for i, (start, end) in enumerate(regions)]

    for i, eid in enumerate(top_experts):
        if i >= len(axes):
            break
        ax = axes[i]

        # Fix the colour range to [0, 1] so every panel agrees with the
        # shared colour bar instead of being auto-scaled individually.
        sns.heatmap(expert_flow[eid], annot=True, fmt=".2f", cmap="YlGnBu",
                    vmin=0, vmax=1,
                    xticklabels=region_labels, yticklabels=region_labels,
                    ax=ax, cbar=False)

        ax.set_title(f"Expert {eid}", fontsize=12)

        if i >= len(top_experts) - n_cols:  # last panels
            ax.set_xlabel("To Region", fontsize=11)
        if i % n_cols == 0:  # first column
            ax.set_ylabel("From Region", fontsize=11)

    # Hide every panel that did not receive an expert.
    for ax in axes[len(top_experts):]:
        ax.axis('off')

    fig.suptitle(f"Layer {layer_idx+1} Top {top_n} Expert Information Flow",
                 fontsize=16, fontweight='bold', y=0.98)
    # Lay out the grid before adding the manually placed colour-bar axes.
    fig.tight_layout()
    fig.subplots_adjust(top=0.9, right=0.9)

    # Shared colour bar on the same explicit 0-1 normalisation as the panels.
    cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
    sm = plt.cm.ScalarMappable(norm=plt.Normalize(vmin=0, vmax=1), cmap="YlGnBu")
    sm.set_array([])
    cbar = fig.colorbar(sm, cax=cbar_ax)
    cbar.set_label("Normalized Flow Strength", fontsize=10)

    fig_path = os.path.join(save_dir, f"expert_information_flow_layer{layer_idx+1}_top{top_n}.pdf")
    fig.savefig(fig_path, format='pdf', bbox_inches='tight')
    print(f"图像已保存至: {fig_path}")
    plt.close(fig)

# 5. 分析专家协作模式
def analyze_expert_cooperation(layer_idx=3):
    """Measure how strongly pairs of experts are selected for the same token.

    For every non-padding token, each ordered pair of distinct top-k slots
    contributes the geometric mean of the two gate scores to
    ``[expert_a, expert_b]``.  The accumulated matrix is divided by its total
    so that all entries sum to (almost exactly) one.

    Reads the module-level arrays ``gate_scores``, ``topk_idx`` and
    ``token_types`` and the sizes ``num_experts``, ``num_seqs`` and
    ``seq_len``.

    Args:
        layer_idx: Zero-based index of the layer to analyse.

    Returns:
        Array of shape ``(num_experts, num_experts)``.
    """
    print(f"分析层 {layer_idx+1} 的专家协作模式...")

    pair_strength = np.zeros((num_experts, num_experts))

    for seq_idx in range(num_seqs):
        # Token type -1 marks padding.
        n_valid = min(seq_len, (token_types[seq_idx] != -1).sum())
        if n_valid == 0:
            continue

        routed = topk_idx[layer_idx, seq_idx, :n_valid]
        gates = gate_scores[layer_idx, seq_idx, :n_valid]

        for pos in range(n_valid):
            chosen = routed[pos]
            chosen_gates = gates[pos, chosen]

            # Geometric mean for every slot pair; the diagonal (a slot paired
            # with itself) is zeroed so it contributes nothing.
            strength = np.sqrt(np.outer(chosen_gates, chosen_gates))
            np.fill_diagonal(strength, 0)
            np.add.at(pair_strength, (chosen[:, None], chosen[None, :]), strength)

    # Scale so the whole matrix sums to one (epsilon avoids division by zero).
    return pair_strength / (np.sum(pair_strength) + 1e-10)

# 6. 可视化专家协作网络
def visualize_expert_cooperation(expert_coop, layer_idx=3, threshold=0.01):
    """Draw the expert cooperation matrix as a network and save it as a PDF.

    Every expert is a node; an undirected edge joins experts ``i < j`` when
    ``expert_coop[i, j]`` exceeds ``threshold``.  Node size and colour follow
    degree centrality, edge width and colour follow the cooperation weight.
    When no pair passes the threshold a warning is printed and only the nodes
    are drawn (no placeholder edge is added, so centrality is not distorted).

    Uses the module-level ``num_experts`` and ``save_dir``.  If ``networkx``
    is not installed a warning is printed and nothing is drawn.

    Args:
        expert_coop: Square cooperation matrix indexed by expert id.
        layer_idx: Zero-based layer index; used for the title and file name.
        threshold: Minimum weight for an edge to be drawn.

    Returns:
        None.
    """
    print("可视化专家协作网络...")

    # Upper triangle only, so each undirected pair appears once.
    links = [
        (f"Expert {i}", f"Expert {j}", float(expert_coop[i, j]))
        for i in range(num_experts)
        for j in range(i+1, num_experts)
        if expert_coop[i, j] > threshold
    ]

    if not links:
        print("警告：专家协作网络中没有超过阈值的连接")

    try:
        import networkx as nx

        G = nx.Graph()
        G.add_nodes_from(f"Expert {i}" for i in range(num_experts))
        G.add_weighted_edges_from(links)

        # Fixed seed keeps the layout reproducible between runs.
        pos = nx.spring_layout(G, k=0.5, seed=42)

        # Compute centrality once and reuse it for both size and colour.
        centrality = nx.degree_centrality(G)
        node_order = list(G.nodes())
        node_size = [300 * (1 + centrality[node]) for node in node_order]
        # A flat list is always treated as values for the colour map; a
        # nested list would be read as one RGB(A) colour for 3 or 4 nodes.
        node_color = [centrality[node] for node in node_order]

        fig = plt.figure(figsize=(12, 10))

        if G.number_of_edges() > 0:
            edge_weights = [G[u][v]['weight'] * 2000 for u, v in G.edges()]
            nx.draw_networkx_edges(
                G, pos, width=edge_weights, alpha=0.7,
                edge_color=edge_weights, edge_cmap=plt.cm.Blues
            )

        nx.draw_networkx_nodes(
            G, pos, node_size=node_size,
            node_color=node_color,
            cmap=plt.cm.viridis, alpha=0.8
        )

        nx.draw_networkx_labels(G, pos, font_size=10)

        plt.title(f"Layer {layer_idx+1} Expert Cooperation Network", fontsize=16, fontweight='bold')
        plt.axis('off')

        fig_path = os.path.join(save_dir, f"expert_cooperation_network_layer{layer_idx+1}.pdf")
        plt.savefig(fig_path, format='pdf', bbox_inches='tight')
        print(f"图像已保存至: {fig_path}")
        plt.close(fig)

    except ImportError:
        print("警告：需要安装networkx库以绘制网络图")

# 7. 分析注意力区域特化
def analyze_attention_specialization(expert_patterns, layer_idx=3):
    """Score how strongly each expert's pattern concentrates on fixed regions.

    Only the first ``n = min(100, seq_len)`` rows and columns are used.  For
    each expert and each region the score is the mean pattern value inside
    the region divided by the mean over the whole ``n x n`` window, so 1.0
    means "no preference".  Experts whose window is entirely zero (or empty)
    get 0 for every region instead of NaN.

    Uses the module-level ``num_experts`` and ``seq_len``.

    Args:
        expert_patterns: Array indexed as ``[expert, row, column]``.
        layer_idx: Zero-based layer index; used only for the progress message.

    Returns:
        ``pandas.DataFrame`` with one row per expert (index named
        ``'Expert ID'``) and one column per region.
    """
    print(f"分析层 {layer_idx+1} 的注意力区域特化...")

    n = min(100, seq_len)  # analyse the first 100 positions at most

    # The region masks depend only on n, so build them once (vectorised)
    # rather than once per expert with Python loops.
    rows, cols = np.indices((n, n))
    distance = np.abs(rows - cols)
    head = n // 4        # exclusive upper bound of the leading quarter
    tail = 3 * n // 4    # inclusive lower bound of the trailing quarter
    region_masks = {
        'Diagonal': distance <= 5,                       # local, near-diagonal
        'Long Range': distance > n // 2,                 # far apart
        'N-terminal': (rows < head) & (cols < head),
        'C-terminal': (rows >= tail) & (cols >= tail),
        'N-to-C': (rows < head) & (cols >= tail),
        'C-to-N': (rows >= tail) & (cols < head),
    }

    expert_region_attention = np.zeros((num_experts, len(region_masks)))

    for eid in range(num_experts):
        pattern = expert_patterns[eid, :n, :n]
        overall_mean = pattern.mean() if pattern.size else 0.0
        if overall_mean == 0:
            # Unused expert: leave the row at 0 rather than dividing by zero.
            continue

        for r_idx, mask in enumerate(region_masks.values()):
            if mask.any():
                expert_region_attention[eid, r_idx] = pattern[mask].mean() / overall_mean

    df_region_attn = pd.DataFrame(expert_region_attention, columns=list(region_masks.keys()))
    df_region_attn.index.name = 'Expert ID'

    return df_region_attn

# 8. 可视化注意力区域特化
def visualize_attention_specialization(df_region_attn, layer_idx=3, top_n=10):
    """Save a heat map of region scores for the most specialised experts.

    An expert's specialisation is the spread (max minus min) of its region
    scores; the ``top_n`` experts with the largest spread are plotted.  The
    colour scale runs from 0 to the largest value in the whole table, and the
    PDF is written into the module-level ``save_dir``.

    Args:
        df_region_attn: Table with one row per expert and one column per
            region.
        layer_idx: Zero-based layer index; used for the title and file name.
        top_n: Number of experts to show.

    Returns:
        None.
    """
    print("可视化注意力区域特化...")

    # Rank experts by the spread of their scores across regions.
    score_spread = df_region_attn.max(axis=1) - df_region_attn.min(axis=1)
    selected_experts = score_spread.nlargest(top_n).index.tolist()
    color_ceiling = df_region_attn.values.max()

    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(
        df_region_attn.loc[selected_experts],
        annot=True,
        fmt=".2f",
        cmap="YlOrRd",
        linewidths=0.5,
        vmin=0,
        vmax=color_ceiling,
    )

    ax.set_title(f"Layer {layer_idx+1} Expert Attention Region Specialization", fontsize=15)
    ax.set_ylabel("Expert ID", fontsize=12)
    ax.set_xlabel("Attention Region", fontsize=12)

    out_name = f"expert_attention_specialization_layer{layer_idx+1}.pdf"
    fig_path = os.path.join(save_dir, out_name)
    plt.savefig(fig_path, format='pdf', bbox_inches='tight')
    print(f"图像已保存至: {fig_path}")
    plt.close()

# Run every analysis for every layer; each visualize_* call saves its figure.
for layer_idx in range(num_layers):
    # 1-2. Per-expert attention patterns: compute, then plot.
    expert_patterns = analyze_expert_attention_patterns(layer_idx)
    visualize_expert_attention_patterns(expert_patterns, layer_idx)
    
    # 3-4. Region-to-region information flow: compute, then plot.
    expert_flow, regions = analyze_expert_information_flow(layer_idx)
    visualize_expert_information_flow(expert_flow, regions, layer_idx)
    
    # 5-6. Expert cooperation network: compute, then plot.
    expert_coop = analyze_expert_cooperation(layer_idx)
    visualize_expert_cooperation(expert_coop, layer_idx)
    
    # 7-8. Attention region specialisation; reuses the patterns from step 1.
    df_region_attn = analyze_attention_specialization(expert_patterns, layer_idx)
    visualize_attention_specialization(df_region_attn, layer_idx)

print("所有分析完成！")

# 添加总结报告生成
def generate_summary_report():
    """Build per-layer expert-usage statistics and save a figure and a CSV.

    For every layer the number of times each expert appears in the top-k
    selection is counted over the non-padding tokens (token type ``-1`` is
    padding, the same rule the other analyses in this cell use).  From the
    counts the function derives the number of active experts, the entropy of
    the usage distribution and its ratio to the maximum ``log2(num_experts)``,
    and the id and share of the most used expert.

    Reads the module-level ``topk_idx``, ``token_types``, ``num_layers``,
    ``num_experts``, ``seq_len`` and ``save_dir``.  Writes
    ``moe_information_flow_summary.pdf`` and ``layer_statistics.csv`` into
    ``save_dir``.

    Returns:
        None.
    """
    print("生成总结报告...")

    # Boolean (sequence, position) mask of the non-padding prefix of each
    # sequence, so padded positions do not inflate the usage counts.
    valid_lens = np.minimum(seq_len, (token_types != -1).sum(axis=1))
    valid_mask = np.arange(seq_len)[np.newaxis, :] < valid_lens[:, np.newaxis]

    layer_stats = []
    for layer_idx in range(num_layers):
        # Expert ids chosen for every valid token: shape (n_valid_tokens, topk).
        routed = topk_idx[layer_idx][valid_mask]
        expert_counts = np.bincount(routed.ravel(), minlength=num_experts).astype(float)
        total_count = expert_counts.sum()

        # Shannon entropy (bits) of the usage distribution.
        probs = expert_counts / (total_count + 1e-10)
        entropy_val = -np.sum(probs * np.log2(probs + 1e-10))
        max_entropy = np.log2(num_experts)

        layer_stats.append({
            'layer': layer_idx + 1,
            'active_experts': int(np.sum(expert_counts > 0)),
            'entropy': entropy_val,
            'entropy_ratio': entropy_val / max_entropy,
            'max_expert': int(np.argmax(expert_counts)),
            # Guard the share against a layer without any valid token.
            'max_expert_ratio': np.max(expert_counts) / total_count if total_count > 0 else 0.0
        })

    df_stats = pd.DataFrame(layer_stats)
    layers = df_stats['layer']

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Number of experts that were used at least once.
    ax = axes[0, 0]
    ax.plot(layers, df_stats['active_experts'], 'o-', linewidth=2)
    ax.set_xlabel('Layer', fontsize=12)
    ax.set_ylabel('Active Experts', fontsize=12)
    ax.set_title('Number of Active Experts per Layer', fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xticks(layers)

    # Usage entropy relative to a perfectly uniform distribution.
    ax = axes[0, 1]
    ax.plot(layers, df_stats['entropy_ratio'], 'o-', linewidth=2, color='orange')
    ax.set_xlabel('Layer', fontsize=12)
    ax.set_ylabel('Entropy Ratio', fontsize=12)
    ax.set_title('Expert Usage Entropy Ratio per Layer', fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xticks(layers)
    ax.set_ylim(0, 1)

    # Id of the most used expert.
    ax = axes[1, 0]
    ax.bar(layers, df_stats['max_expert'], color='green')
    ax.set_xlabel('Layer', fontsize=12)
    ax.set_ylabel('Expert ID', fontsize=12)
    ax.set_title('Most Active Expert ID per Layer', fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xticks(layers)

    # Share of all selections taken by the most used expert.
    ax = axes[1, 1]
    ax.bar(layers, df_stats['max_expert_ratio'], color='purple')
    ax.set_xlabel('Layer', fontsize=12)
    ax.set_ylabel('Ratio', fontsize=12)
    ax.set_title('Most Active Expert Usage Ratio', fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xticks(layers)
    ax.set_ylim(0, 1)

    fig.suptitle('MoE Information Flow Analysis Summary', fontsize=18, fontweight='bold', y=0.98)
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)

    summary_path = os.path.join(save_dir, "moe_information_flow_summary.pdf")
    fig.savefig(summary_path, format='pdf', bbox_inches='tight')
    print(f"总结报告已保存至: {summary_path}")
    plt.close(fig)

    csv_path = os.path.join(save_dir, "layer_statistics.csv")
    df_stats.to_csv(csv_path, index=False)
    print(f"统计数据已保存至: {csv_path}")

# Generate the per-layer summary figure and CSV.
generate_summary_report()

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import os
from matplotlib.colors import LinearSegmentedColormap

# Create the output directory for this cell's figures and tables.
save_dir = "/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/expert_evolution"
os.makedirs(save_dir, exist_ok=True)

# Locations of the saved routing arrays.
topk_idx_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy"
gate_scores_path = "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy"

# The shape notes are the original author's annotations; presumably the axes
# are (layer, sequence, position, ...) -- TODO confirm against the export code.
print("加载数据中...")
topk_idx = np.load(topk_idx_path)  # (4, 1149, 300, 3)
gate_scores = np.load(gate_scores_path)  # (4, 1149, 300, 30)

# Derive the problem sizes from the loaded arrays.
num_layers, num_seqs, seq_len, _ = topk_idx.shape
num_experts = gate_scores.shape[-1]
topk = topk_idx.shape[-1]

print(f"数据加载完成: {num_layers}层, {num_experts}专家, 每个位置分配{topk}个专家")

def analyze_expert_evolution(topk_idx, gate_scores, save_dir):
    """Analyse how expert usage and importance change from layer to layer.

    Args:
        topk_idx: Integer array indexed as ``[layer, sequence, position,
            slot]`` holding the ids of the selected experts.
        gate_scores: Array indexed as ``[layer, sequence, position, expert]``
            holding the gate score of every expert.
        save_dir: Directory that receives ``expert_evolution_stats.csv``.

    Returns:
        Dict with the keys

        * ``layer_expert_usage`` / ``layer_expert_usage_norm``: selection
          counts per ``[layer, expert]`` and their per-layer shares;
        * ``layer_expert_importance`` / ``layer_expert_importance_norm``:
          summed gate scores of the selections and their per-layer shares;
        * ``layer_changes_usage`` / ``layer_changes_importance``: L1 distance
          between the shares of consecutive layers;
        * ``top_experts_per_layer``: per layer, the experts (most important
          first) whose cumulative importance share first exceeds 0.8;
        * ``expert_stability``: inverse standard deviation of each expert's
          importance share across layers, scaled so the maximum is 1;
        * ``transition_matrices``: one row-normalised ``[expert in layer l,
          expert in layer l+1]`` matrix per consecutive layer pair; rows of
          experts never selected in layer ``l`` are all zero.
    """
    num_layers = topk_idx.shape[0]
    num_experts = gate_scores.shape[-1]

    print("分析专家随层演化趋势...")

    # Gate score of every selected (token, slot) pair; same shape as topk_idx.
    routed_gates = np.take_along_axis(gate_scores, topk_idx, axis=-1)

    # 1. Per-layer selection counts and summed gate scores per expert.
    layer_expert_usage = np.zeros((num_layers, num_experts))
    layer_expert_importance = np.zeros((num_layers, num_experts))
    for layer in range(num_layers):
        selected_ids = topk_idx[layer].ravel()
        layer_expert_usage[layer] = np.bincount(selected_ids, minlength=num_experts)
        layer_expert_importance[layer] = np.bincount(
            selected_ids, weights=routed_gates[layer].ravel(), minlength=num_experts)

    # Per-layer shares; a layer whose total is not positive stays all zero.
    usage_totals = layer_expert_usage.sum(axis=1, keepdims=True)
    importance_totals = layer_expert_importance.sum(axis=1, keepdims=True)
    layer_expert_usage_norm = np.divide(
        layer_expert_usage, usage_totals,
        out=np.zeros_like(layer_expert_usage), where=usage_totals > 0)
    layer_expert_importance_norm = np.divide(
        layer_expert_importance, importance_totals,
        out=np.zeros_like(layer_expert_importance), where=importance_totals > 0)

    # 2. L1 distance between the distributions of consecutive layers.
    layer_changes_usage = np.abs(np.diff(layer_expert_usage_norm, axis=0)).sum(axis=1)
    layer_changes_importance = np.abs(np.diff(layer_expert_importance_norm, axis=0)).sum(axis=1)

    # 3. Smallest set of top experts whose cumulative importance exceeds 80%.
    top_experts_per_layer = []
    for layer in range(num_layers):
        ranked = np.argsort(layer_expert_importance_norm[layer])[::-1]
        cumulative = 0
        selected = []
        for idx in ranked:
            selected.append(idx)
            cumulative += layer_expert_importance_norm[layer, idx]
            if cumulative > 0.8:
                break
        top_experts_per_layer.append(selected)

    # 4. Stability = inverse spread of the importance share across layers.
    # NOTE(review): an expert with an identical share in every layer
    # (including one that is never used) gets ~1e10 here and, after scaling,
    # pushes all other scores towards 0 -- confirm this is acceptable.
    expert_stability = 1.0 / (np.std(layer_expert_importance_norm, axis=0) + 1e-10)
    expert_stability = expert_stability / np.max(expert_stability)

    # 5. Layer-to-layer transition matrices.  For each token, every pair
    # (expert in layer l, expert in layer l+1) adds the product of the two
    # gate scores.
    transition_matrices = []
    for l in range(num_layers - 1):
        trans_matrix = np.zeros((num_experts, num_experts))
        pair_weight = routed_gates[l][..., :, np.newaxis] * routed_gates[l+1][..., np.newaxis, :]
        np.add.at(
            trans_matrix,
            (topk_idx[l][..., :, np.newaxis], topk_idx[l+1][..., np.newaxis, :]),
            pair_weight,
        )

        # Row-normalise.  `out` must be pre-filled: with `where=` alone the
        # rows that are skipped would contain uninitialised memory.
        row_sums = trans_matrix.sum(axis=1, keepdims=True)
        norm_matrix = np.divide(trans_matrix, row_sums,
                                out=np.zeros_like(trans_matrix), where=row_sums != 0)

        transition_matrices.append(norm_matrix)

    # Save the per-expert statistics as CSV.
    results_df = pd.DataFrame({
        'expert_id': range(num_experts),
        'stability': expert_stability
    })
    for layer in range(num_layers):
        results_df[f'layer_{layer}_importance'] = layer_expert_importance_norm[layer]
        results_df[f'layer_{layer}_usage'] = layer_expert_usage_norm[layer]

    results_df.to_csv(os.path.join(save_dir, 'expert_evolution_stats.csv'), index=False)

    return {
        'layer_expert_usage': layer_expert_usage,
        'layer_expert_usage_norm': layer_expert_usage_norm,
        'layer_expert_importance': layer_expert_importance,
        'layer_expert_importance_norm': layer_expert_importance_norm,
        'layer_changes_usage': layer_changes_usage,
        'layer_changes_importance': layer_changes_importance,
        'top_experts_per_layer': top_experts_per_layer,
        'expert_stability': expert_stability,
        'transition_matrices': transition_matrices
    }

def visualize_expert_evolution(results, save_dir):
    """Render the figures for a result dict of ``analyze_expert_evolution``.

    Writes into ``save_dir``:

    * ``expert_distribution_across_layers.pdf`` -- usage and importance heat
      maps, with the top experts of each layer outlined in red;
    * ``expert_role_shift_between_layers.pdf`` -- L1 change between
      consecutive layers;
    * ``expert_stability_across_layers.pdf`` -- experts sorted by stability;
    * ``expert_transition_layer{a}_to_layer{b}.pdf`` -- the first two
      transition matrices;
    * ``expert_evolution_sankey.html`` -- Sankey diagram of the strongest
      transitions (skipped with a message when ``plotly`` is missing).

    Args:
        results: Dict as returned by ``analyze_expert_evolution``.
        save_dir: Output directory.

    Returns:
        None.
    """
    num_layers = results['layer_expert_usage'].shape[0]
    num_experts = results['layer_expert_usage'].shape[1]
    layer_labels = [f'Layer {i+1}' for i in range(num_layers)]

    # 1. Expert usage heat map (top panel).
    plt.figure(figsize=(14, 10))
    plt.subplot(2, 1, 1)

    # Custom light-to-dark blue colour map shared by both panels.
    cmap = LinearSegmentedColormap.from_list(
        'custom_blues',
        [(0, '#f7fbff'), (0.2, '#deebf7'), (0.4, '#c6dbef'),
         (0.6, '#9ecae1'), (0.8, '#6baed6'), (1, '#08519c')],
        N=256
    )

    usage_data = results['layer_expert_usage_norm']
    sns.heatmap(
        usage_data,
        cmap=cmap,
        annot=False,
        fmt='.2f',
        linewidths=0.5,
        cbar_kws={'label': 'Normalized Usage'},
        xticklabels=range(num_experts),
        yticklabels=layer_labels
    )

    plt.title('Expert Usage Distribution Across Layers', fontsize=16)
    plt.xlabel('Expert ID', fontsize=14)
    plt.ylabel('', fontsize=14)

    # 2. Expert importance heat map (bottom panel).
    plt.subplot(2, 1, 2)
    importance_data = results['layer_expert_importance_norm']

    sns.heatmap(
        importance_data,
        cmap=cmap,
        annot=False,
        fmt='.2f',
        linewidths=0.5,
        cbar_kws={'label': 'Normalized Importance'},
        xticklabels=range(num_experts),
        yticklabels=layer_labels
    )

    # Outline the cells of each layer's top experts in red.
    for layer, top_experts in enumerate(results['top_experts_per_layer']):
        for expert in top_experts:
            plt.gca().add_patch(plt.Rectangle((expert, layer), 1, 1,
                                fill=False, edgecolor='red', lw=2))

    plt.title('Expert Importance Distribution Across Layers', fontsize=16)
    plt.xlabel('Expert ID', fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'expert_distribution_across_layers.pdf'), format='pdf')
    plt.close()

    # 3. Change between consecutive layers.
    plt.figure(figsize=(12, 6))
    x = np.arange(1, num_layers)

    plt.plot(x, results['layer_changes_usage'], 'o-', linewidth=2, markersize=10,
             color='#1f77b4', label='Usage Change')
    plt.plot(x, results['layer_changes_importance'], 's-', linewidth=2, markersize=10,
             color='#ff7f0e', label='Importance Change')

    # Value labels: usage above its marker, importance below.
    for i, (u, imp) in enumerate(zip(results['layer_changes_usage'], results['layer_changes_importance'])):
        plt.text(x[i], u+0.02, f"{u:.2f}", ha='center', fontsize=12)
        plt.text(x[i], imp-0.04, f"{imp:.2f}", ha='center', fontsize=12)

    plt.xlabel('Layer Transition', fontsize=14)
    plt.ylabel('Distribution Change (L1 Distance)', fontsize=14)
    plt.title('Expert Role Shift Between Consecutive Layers', fontsize=16)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xticks(x, [f'Layer {i} → {i+1}' for i in x])
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'expert_role_shift_between_layers.pdf'), format='pdf')
    plt.close()

    # 4. Expert stability bar chart, most stable first.
    plt.figure(figsize=(14, 7))
    stability = results['expert_stability']

    sorted_indices = np.argsort(stability)[::-1]
    sorted_stability = stability[sorted_indices]

    # Bar colour follows the (0-1 scaled) stability score.
    colors = plt.cm.viridis(sorted_stability)

    plt.bar(range(num_experts), sorted_stability, color=colors)
    plt.title('Expert Stability Across Layers', fontsize=16)
    plt.xlabel('Expert ID (Sorted by Stability)', fontsize=14)
    plt.ylabel('Normalized Stability Score', fontsize=14)
    plt.xticks(range(num_experts), [f'E{i}' for i in sorted_indices], rotation=90)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate only the clearly stable experts.
    for i, (idx, stab) in enumerate(zip(sorted_indices, sorted_stability)):
        if stab > 0.5:
            plt.text(i, stab + 0.03, f"E{idx}", ha='center', fontsize=10, weight='bold')

    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'expert_stability_across_layers.pdf'), format='pdf')
    plt.close()

    # 5. Transition heat maps for the first two layer pairs.
    for l in range(min(2, len(results['transition_matrices']))):
        plt.figure(figsize=(12, 10))
        trans_matrix = results['transition_matrices'][l]

        # Hide weak transitions (probability below 0.05) to keep the map readable.
        mask = trans_matrix < 0.05

        sns.heatmap(
            trans_matrix,
            cmap="YlGnBu",
            annot=True,
            fmt='.2f',
            linewidths=0.5,
            mask=mask,
            cbar_kws={'label': 'Transition Probability'},
            xticklabels=[f'E{i}' for i in range(num_experts)],
            yticklabels=[f'E{i}' for i in range(num_experts)]
        )

        plt.title(f'Expert Transition Matrix: Layer {l+1} → Layer {l+2}', fontsize=16)
        plt.xlabel(f'Experts in Layer {l+2}', fontsize=14)
        plt.ylabel(f'Experts in Layer {l+1}', fontsize=14)
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, f'expert_transition_layer{l+1}_to_layer{l+2}.pdf'), format='pdf')
        plt.close()

    # 6. Sankey diagram of the strongest transitions between important experts.
    try:
        import plotly.graph_objects as go

        top_k_experts = 10
        importance_norm = results['layer_expert_importance_norm']

        # Nodes are (layer, expert) pairs kept in a fixed order (layer by
        # layer, ascending importance) so the output is reproducible.
        node_keys = []
        for layer in range(num_layers):
            for e in np.argsort(importance_norm[layer])[-top_k_experts:]:
                node_keys.append((layer, int(e)))

        nodes = [f"L{layer+1}_E{e}" for layer, e in node_keys]
        node_index = {key: i for i, key in enumerate(node_keys)}
        node_color = [
            f"rgba(31, 119, 180, {0.3 + 0.7*importance_norm[layer, e]})"
            for layer, e in node_keys
        ]

        links_source = []
        links_target = []
        links_value = []
        links_color = []

        for l in range(num_layers - 1):
            trans_matrix = results['transition_matrices'][l]

            for i in range(num_experts):
                src_idx = node_index.get((l, i))
                if src_idx is None:
                    continue
                for j in range(num_experts):
                    tgt_idx = node_index.get((l + 1, j))
                    if tgt_idx is None:
                        continue

                    value = trans_matrix[i, j]
                    if value > 0.1:  # keep only the stronger links
                        links_source.append(src_idx)
                        links_target.append(tgt_idx)
                        links_value.append(value * 10)  # enlarge for visibility
                        links_color.append(f"rgba(255, 127, 14, {value})")

        fig = go.Figure(data=[go.Sankey(
            node = dict(
                pad = 15,
                thickness = 20,
                line = dict(color = "black", width = 0.5),
                label = nodes,
                color = node_color
            ),
            link = dict(
                source = links_source,
                target = links_target,
                value = links_value,
                color = links_color
            )
        )])

        fig.update_layout(
            title_text="Expert Evolution Across Layers (Top Important Experts)",
            font_size=12,
            height=800
        )

        fig.write_html(os.path.join(save_dir, 'expert_evolution_sankey.html'))

    except ImportError:
        print("plotly 库未安装，跳过桑基图生成")

# Run the expert-evolution analysis on the arrays loaded above.
print("执行专家演化分析...")
results = analyze_expert_evolution(topk_idx, gate_scores, save_dir)

# Render and save every figure for the result.
print("可视化分析结果...")
visualize_expert_evolution(results, save_dir)

print(f"分析完成，结果保存在 {save_dir}")

# 生成摘要报告
def generate_summary_report(results, save_dir):
    """Write an HTML summary report of the expert-evolution analysis.

    The report lists the top-3 experts of every layer, the five experts with
    the highest stability score, and the layer transition with the largest
    importance change. It is written to
    ``<save_dir>/expert_evolution_report.html`` as UTF-8.

    Parameters
    ----------
    results : mapping
        Must provide the keys read here:
        ``'layer_expert_usage'`` (only ``shape[0]`` is used, as the number of
        layers), ``'layer_expert_importance_norm'`` (indexed per layer, each
        entry a 1-D NumPy array), ``'expert_stability'`` (indexed per expert)
        and ``'layer_changes_importance'`` (one value per layer transition;
        may be empty).
    save_dir : str
        Directory for the report; created if it does not exist.

    Returns
    -------
    None
    """
    num_layers = results['layer_expert_usage'].shape[0]
    importance_by_layer = results['layer_expert_importance_norm']
    stability = results['expert_stability']
    changes = np.asarray(results['layer_changes_importance'])

    # One table row per layer: its three most important experts (descending)
    # and the sum of their importance values.
    table_rows = []
    for layer in range(num_layers):
        importance = importance_by_layer[layer]
        top3 = np.argsort(importance)[-3:][::-1]
        values = importance[top3]
        experts_str = ", ".join(
            f"Expert {e} ({v:.3f})" for e, v in zip(top3, values)
        )
        table_rows.append(
            f"<tr><td>Layer {layer+1}</td><td>{experts_str}</td>"
            f"<td>{values.sum():.3f}</td></tr>"
        )

    # The five experts with the highest stability score, best first.
    most_stable = np.argsort(stability)[-5:][::-1]
    stable_items = [
        f"<li>Expert {e} (稳定性: {stability[e]:.3f})</li>" for e in most_stable
    ]

    # Largest layer-to-layer change. np.argmax raises on an empty array
    # (e.g. a single-layer model), so that case gets a placeholder sentence.
    if changes.size:
        max_change_idx = int(np.argmax(changes))
        change_html = (
            f"<p>最大的专家角色转变发生在 Layer {max_change_idx+1} → "
            f"Layer {max_change_idx+2} (变化值: {changes[max_change_idx]:.3f})</p>"
        )
    else:
        change_html = "<p>层数不足，无法计算层间变化。</p>"

    # Plain (non-f) string: the CSS braces need no escaping. The charset
    # declaration matches the UTF-8 encoding used when writing the file, so
    # browsers render the Chinese text correctly.
    html_head = """
    <html>
    <head>
        <meta charset="utf-8">
        <title>专家演化分析报告</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            h1 { color: #2c3e50; }
            h2 { color: #3498db; }
            .insight { background-color: #f8f9fa; padding: 15px; margin: 10px 0; border-left: 5px solid #3498db; }
            table { border-collapse: collapse; width: 100%; }
            th, td { padding: 8px; text-align: left; border-bottom: 1px solid #ddd; }
            th { background-color: #f2f2f2; }
            tr:hover { background-color: #f5f5f5; }
        </style>
    </head>
    <body>
        <h1>MOE专家演化分析报告</h1>
        
        <h2>主要发现</h2>
        
        <div class="insight">
            <h3>关键专家</h3>
            <p>各层中最重要的专家:</p>
            <table>
                <tr><th>层</th><th>最重要专家</th><th>重要性值</th></tr>
    """

    html_stability = """
            </table>
        </div>
        
        <div class="insight">
            <h3>专家稳定性</h3>
            <p>最稳定的5个专家 (在不同层中保持一致的角色):</p>
            <ul>
    """

    html_tail = f"""
            </ul>
        </div>
        
        <div class="insight">
            <h3>层间变化</h3>
            {change_html}
        </div>
        
        <h2>可视化</h2>
        <p>详细的可视化结果请查看保存的PDF文件。</p>
        
        <h2>结论</h2>
        <p>MOE模型中的专家在不同层展现出明显的专业化和演化模式。早期层专注于基础特征提取，而较深的层则更多关注高级语义模式。
        某些专家在整个网络中保持稳定的角色，而另一些则在不同层之间转变其功能。</p>
    </body>
    </html>
    """

    html = (
        html_head
        + "".join(table_rows)
        + html_stability
        + "".join(stable_items)
        + html_tail
    )

    # Write explicitly as UTF-8: the report is full of non-ASCII text and the
    # platform default encoding (e.g. cp1252 / GBK) may not match the browser.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    report_path = os.path.join(save_dir, 'expert_evolution_report.html')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html)

    print(f"摘要报告已保存至 {report_path}")

# Generate the HTML summary report from the analysis results
generate_summary_report(results, save_dir)

In [11]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import os

# Create the output directory (path is relative to the current working directory)
save_dir = "moe_analysis/expert_evolution"
os.makedirs(save_dir, exist_ok=True)

def create_expert_evolution_sankey(topk_idx, gate_scores, save_path,
                                   top_k_per_layer=6, min_link_weight=0.01):
    """
    Build a Sankey diagram of how routing weight flows between the most
    important experts of consecutive layers, and save it as an HTML file.

    Every diagram column contains only experts selected for that layer.

    Parameters:
    topk_idx: integer array indexed as [layer, sequence, position, k] holding
        the ids of the experts routed to at each position.
    gate_scores: array indexed as [layer, sequence, position, expert]; the
        length of its last axis is taken as the number of experts.
    save_path: path of the HTML file to write.
    top_k_per_layer: maximum number of experts shown per layer (capped at the
        number of experts). Defaults to 6.
    min_link_weight: a link is drawn only when its accumulated weight is
        strictly greater than this value. Defaults to 0.01.

    Returns:
    The plotly Figure that was written to ``save_path``.
    """
    topk_idx = np.asarray(topk_idx)
    gate_scores = np.asarray(gate_scores)
    num_layers, num_seqs, seq_len, num_routed = topk_idx.shape
    num_experts = gate_scores.shape[-1]
    num_positions = num_seqs * seq_len

    # Per layer: routed expert ids and their gate scores, both shaped
    # (positions, k), with (sequence, position) flattened in row-major order.
    layer_ids = []
    layer_scores = []
    for layer in range(num_layers):
        ids = topk_idx[layer].reshape(num_positions, num_routed)
        gates = gate_scores[layer, :num_seqs, :seq_len].reshape(num_positions, num_experts)
        layer_ids.append(ids)
        layer_scores.append(np.take_along_axis(gates, ids, axis=-1))

    # Importance of an expert in a layer = sum of the gate scores it received
    # whenever it was among the routed experts.
    layer_expert_importance = np.zeros((num_layers, num_experts))
    for layer in range(num_layers):
        # np.add.at accumulates correctly when the same expert id repeats.
        np.add.at(layer_expert_importance[layer], layer_ids[layer], layer_scores[layer])

    # Normalize each layer to sum to 1 (layers with no weight stay all-zero).
    for layer in range(num_layers):
        total = layer_expert_importance[layer].sum()
        if total > 0:
            layer_expert_importance[layer] /= total

    # Keep the (at most) top_k_per_layer most important experts of each layer,
    # most important first.
    top_k_per_layer = max(0, min(int(top_k_per_layer), num_experts))
    selected_ids = [
        np.argsort(layer_expert_importance[layer])[::-1][:top_k_per_layer]
        for layer in range(num_layers)
    ]

    # Node list: layer by layer, so node index = layer * top_k_per_layer + slot.
    all_experts = [
        f"L{layer+1}_E{idx}"
        for layer, ids in enumerate(selected_ids)
        for idx in ids
    ]

    # Accumulate transition weights between selected experts of adjacent layers.
    links = []
    slots = np.arange(top_k_per_layer)
    for l in range(num_layers - 1):
        # expert id -> slot within the layer's selection, -1 if not selected
        src_slot = np.full(num_experts, -1, dtype=np.intp)
        src_slot[selected_ids[l]] = slots
        tgt_slot = np.full(num_experts, -1, dtype=np.intp)
        tgt_slot[selected_ids[l + 1]] = slots

        # Negative ids never match a selected expert (NumPy would otherwise
        # wrap them around to the end of the lookup table).
        src = np.where(layer_ids[l] >= 0, src_slot[layer_ids[l]], -1)[:, :, None]
        tgt = np.where(layer_ids[l + 1] >= 0, tgt_slot[layer_ids[l + 1]], -1)[:, None, :]

        # Every (current expert, next expert) pair at the same position
        # contributes the product of the two gate scores.
        weights = layer_scores[l][:, :, None] * layer_scores[l + 1][:, None, :]
        keep = (src >= 0) & (tgt >= 0)
        trans_matrix = np.zeros((top_k_per_layer, top_k_per_layer))
        np.add.at(
            trans_matrix,
            (np.broadcast_to(src, keep.shape)[keep],
             np.broadcast_to(tgt, keep.shape)[keep]),
            weights[keep],
        )

        # Only keep meaningful links; values are scaled by 100 for visibility.
        for i, j in zip(*np.nonzero(trans_matrix > min_link_weight)):
            links.append((
                l * top_k_per_layer + int(i),
                (l + 1) * top_k_per_layer + int(j),
                float(trans_matrix[i, j] * 100),
            ))

    # Prepare the Sankey data
    node_labels = all_experts
    link_sources = [link[0] for link in links]
    link_targets = [link[1] for link in links]
    link_values = [link[2] for link in links]

    # One color per layer; layers beyond the third share the same color.
    layer_palette = {
        1: "rgba(31, 119, 180, 0.9)",  # blue
        2: "rgba(44, 160, 44, 0.9)",   # green
        3: "rgba(214, 39, 40, 0.9)",   # red
    }
    default_color = "rgba(148, 103, 189, 0.9)"  # purple
    node_colors = [
        layer_palette.get(layer + 1, default_color)
        for layer in range(num_layers)
        for _ in range(top_k_per_layer)
    ]

    # Link color and opacity depend on the (scaled) link value.
    link_colors = []
    for _, _, val in links:
        strength = min(1.0, val / 50)  # clamp to the 0-1 range

        if strength > 0.7:
            link_colors.append(f"rgba(255, 165, 0, {0.7 + 0.3*strength})")  # orange, strong link
        elif strength > 0.4:
            link_colors.append(f"rgba(255, 140, 0, {0.5 + 0.3*strength})")  # dark orange, medium link
        else:
            link_colors.append(f"rgba(210, 105, 30, {0.3 + 0.3*strength})")  # brown, weak link

    # Build the Sankey figure
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = node_labels,
            color = node_colors
        ),
        link = dict(
            source = link_sources,
            target = link_targets,
            value = link_values,
            color = link_colors
        )
    )])

    # Layout
    fig.update_layout(
        title_text="Expert Evolution Across Layers",
        font_size=12,
        height=800,
        paper_bgcolor='rgba(240, 240, 240, 0.9)',  # light grey background
        plot_bgcolor='rgba(240, 240, 240, 0.9)'
    )

    # Save as HTML
    fig.write_html(save_path)

    return fig

# 假设这些是从之前加载的数据
# topk_idx = np.load("topk_idx.npy")
# gate_scores = np.load("gate_scores.npy")

# 使用示例
# create_expert_evolution_sankey(
#     topk_idx, 
#     gate_scores, 
#     os.path.join(save_dir, 'improved_expert_evolution_sankey.html')
# )

def run_sankey_analysis(topk_idx_path, gate_scores_path, save_dir):
    """Load the saved routing arrays and write the expert-evolution Sankey diagram.

    Parameters:
    topk_idx_path: path to the ``.npy`` file with the top-k expert indices.
    gate_scores_path: path to the ``.npy`` file with the gate scores.
    save_dir: directory that receives ``expert_evolution_sankey.html``;
        created if it does not exist yet.
    """
    print("加载数据中...")
    topk_idx = np.load(topk_idx_path)
    gate_scores = np.load(gate_scores_path)

    # The HTML writer does not create parent directories, and save_dir is not
    # necessarily the directory created at module level.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    print("创建专家演化桑基图...")
    create_expert_evolution_sankey(
        topk_idx,
        gate_scores,
        os.path.join(save_dir, 'expert_evolution_sankey.html')
    )

    print(f"分析完成，结果保存在 {save_dir}")

# Example invocation: runs the analysis on the saved experiment arrays
run_sankey_analysis(
     "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy",
     "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy",
     "/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/expert_evolution"
)

In [2]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import os

# Create the output directory (path is relative to the current working directory)
save_dir = "moe_analysis/expert_evolution"
os.makedirs(save_dir, exist_ok=True)

def create_expert_evolution_sankey(topk_idx, gate_scores, token_types, save_path,
                                   padding_value=None, top_k_per_layer=6,
                                   min_link_weight=0.01):
    """
    Build a Sankey diagram of how routing weight flows between the most
    important experts of consecutive layers, ignoring padding positions,
    and save it as an HTML file.

    Parameters:
    topk_idx: integer array indexed as [layer, sequence, position, k] holding
        the ids of the experts routed to at each position.
    gate_scores: array indexed as [layer, sequence, position, expert]; the
        length of its last axis is taken as the number of experts.
    token_types: 2-D array indexed as [sequence, position]; positions whose
        value equals the padding value are excluded from all statistics.
    save_path: path of the HTML file to write.
    padding_value: token value that marks padding. When None (default) it is
        guessed: -1 unless one of the first 10 sequences ends in a run of a
        single value other than -1, in which case that value is used.
        CAUTION: the guess also fires on an unpadded sequence that merely
        ends with repeated identical tokens; pass the value explicitly when
        it is known.
    top_k_per_layer: maximum number of experts shown per layer (capped at the
        number of experts). Defaults to 6.
    min_link_weight: a link is drawn only when its accumulated weight is
        strictly greater than this value. Defaults to 0.01.

    Returns:
    The plotly Figure that was written to ``save_path``.

    Raises:
    IndexError: if ``token_types`` does not cover every (sequence, position)
        of ``topk_idx``.
    """
    topk_idx = np.asarray(topk_idx)
    gate_scores = np.asarray(gate_scores)
    token_types = np.asarray(token_types)
    num_layers, num_seqs, seq_len, _ = topk_idx.shape
    num_experts = gate_scores.shape[-1]

    if (token_types.ndim != 2
            or token_types.shape[0] < num_seqs
            or token_types.shape[1] < seq_len):
        raise IndexError(
            f"token_types with shape {token_types.shape} does not cover "
            f"{num_seqs} sequences of length {seq_len}"
        )

    if padding_value is None:
        # Heuristic detection: inspect the tail of a few sequences; a tail made
        # of one repeated value (other than -1) is assumed to be padding.
        padding_value = -1
        for seq_idx in range(min(10, num_seqs)):
            end_values = token_types[seq_idx, -10:]
            if (end_values.size
                    and np.all(end_values == end_values[0])
                    and end_values[0] != -1):
                padding_value = end_values[0]
                print(f"检测到可能的padding值: {padding_value}")
                break

    print(f"使用 {padding_value} 作为padding标识符")

    # Mask of real (non-padding) positions, computed once for all layers.
    valid = token_types[:num_seqs, :seq_len] != padding_value

    # Per layer: routed expert ids and their gate scores at valid positions,
    # both shaped (valid positions, k). Masking happens before the gather so
    # whatever is stored at padding positions is never looked at.
    layer_ids = []
    layer_scores = []
    for layer in range(num_layers):
        ids = topk_idx[layer][valid]
        gates = gate_scores[layer, :num_seqs, :seq_len][valid]
        layer_ids.append(ids)
        layer_scores.append(np.take_along_axis(gates, ids, axis=-1))

    # Importance of an expert in a layer = sum of the gate scores it received
    # whenever it was among the routed experts of a valid position.
    layer_expert_importance = np.zeros((num_layers, num_experts))
    for layer in range(num_layers):
        # np.add.at accumulates correctly when the same expert id repeats.
        np.add.at(layer_expert_importance[layer], layer_ids[layer], layer_scores[layer])

    # Normalize each layer to sum to 1 (layers with no weight stay all-zero).
    for layer in range(num_layers):
        total = layer_expert_importance[layer].sum()
        if total > 0:
            layer_expert_importance[layer] /= total

    # Keep the (at most) top_k_per_layer most important experts of each layer,
    # most important first.
    top_k_per_layer = max(0, min(int(top_k_per_layer), num_experts))
    selected_ids = [
        np.argsort(layer_expert_importance[layer])[::-1][:top_k_per_layer]
        for layer in range(num_layers)
    ]

    # Node list: layer by layer, so node index = layer * top_k_per_layer + slot.
    all_experts = [
        f"L{layer+1}_E{idx}"
        for layer, ids in enumerate(selected_ids)
        for idx in ids
    ]

    # Accumulate transition weights between selected experts of adjacent layers.
    links = []
    slots = np.arange(top_k_per_layer)
    for l in range(num_layers - 1):
        # expert id -> slot within the layer's selection, -1 if not selected
        src_slot = np.full(num_experts, -1, dtype=np.intp)
        src_slot[selected_ids[l]] = slots
        tgt_slot = np.full(num_experts, -1, dtype=np.intp)
        tgt_slot[selected_ids[l + 1]] = slots

        # Negative ids never match a selected expert (NumPy would otherwise
        # wrap them around to the end of the lookup table).
        src = np.where(layer_ids[l] >= 0, src_slot[layer_ids[l]], -1)[:, :, None]
        tgt = np.where(layer_ids[l + 1] >= 0, tgt_slot[layer_ids[l + 1]], -1)[:, None, :]

        # Every (current expert, next expert) pair at the same position
        # contributes the product of the two gate scores.
        weights = layer_scores[l][:, :, None] * layer_scores[l + 1][:, None, :]
        keep = (src >= 0) & (tgt >= 0)
        trans_matrix = np.zeros((top_k_per_layer, top_k_per_layer))
        np.add.at(
            trans_matrix,
            (np.broadcast_to(src, keep.shape)[keep],
             np.broadcast_to(tgt, keep.shape)[keep]),
            weights[keep],
        )

        # Only keep meaningful links; values are scaled by 100 for visibility.
        for i, j in zip(*np.nonzero(trans_matrix > min_link_weight)):
            links.append((
                l * top_k_per_layer + int(i),
                (l + 1) * top_k_per_layer + int(j),
                float(trans_matrix[i, j] * 100),
            ))

    # Prepare the Sankey data
    node_labels = all_experts
    link_sources = [link[0] for link in links]
    link_targets = [link[1] for link in links]
    link_values = [link[2] for link in links]

    # One color per layer; layers beyond the third share the same color.
    layer_palette = {
        1: "rgba(31, 119, 180, 0.9)",  # blue
        2: "rgba(44, 160, 44, 0.9)",   # green
        3: "rgba(214, 39, 40, 0.9)",   # red
    }
    default_color = "rgba(148, 103, 189, 0.9)"  # purple
    node_colors = [
        layer_palette.get(layer + 1, default_color)
        for layer in range(num_layers)
        for _ in range(top_k_per_layer)
    ]

    # Link color and opacity depend on the (scaled) link value.
    link_colors = []
    for _, _, val in links:
        strength = min(1.0, val / 50)  # clamp to the 0-1 range

        if strength > 0.7:
            link_colors.append(f"rgba(255, 165, 0, {0.7 + 0.3*strength})")  # orange, strong link
        elif strength > 0.4:
            link_colors.append(f"rgba(255, 140, 0, {0.5 + 0.3*strength})")  # dark orange, medium link
        else:
            link_colors.append(f"rgba(210, 105, 30, {0.3 + 0.3*strength})")  # brown, weak link

    # Build the Sankey figure
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = node_labels,
            color = node_colors
        ),
        link = dict(
            source = link_sources,
            target = link_targets,
            value = link_values,
            color = link_colors
        )
    )])

    # Layout
    fig.update_layout(
        title_text="Expert Evolution Across Layers",
        font_size=12,
        height=800,
        paper_bgcolor='rgba(240, 240, 240, 0.9)',  # light grey background
        plot_bgcolor='rgba(240, 240, 240, 0.9)'
    )

    # Save as HTML
    fig.write_html(save_path)

    return fig

def run_sankey_analysis(topk_idx_path, gate_scores_path, token_types_path, save_dir):
    """Load the saved routing arrays and write the padding-aware Sankey diagram.

    Parameters:
    topk_idx_path: path to the ``.npy`` file with the top-k expert indices.
    gate_scores_path: path to the ``.npy`` file with the gate scores.
    token_types_path: path to the ``.npy`` file with the per-position token
        types used to identify padding.
    save_dir: directory that receives ``expert_evolution_sankey.html``;
        created if it does not exist yet.
    """
    print("加载数据中...")
    topk_idx = np.load(topk_idx_path)
    gate_scores = np.load(gate_scores_path)
    token_types = np.load(token_types_path)

    print(f"数据形状: topk_idx={topk_idx.shape}, gate_scores={gate_scores.shape}, token_types={token_types.shape}")

    # The HTML writer does not create parent directories, and save_dir is not
    # necessarily the directory created at module level.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    print("创建专家演化桑基图...")
    create_expert_evolution_sankey(
        topk_idx,
        gate_scores,
        token_types,
        os.path.join(save_dir, 'expert_evolution_sankey.html')
    )

    print(f"分析完成，结果保存在 {save_dir}")

# Entry point: run the padding-aware Sankey analysis on the saved experiment arrays
if __name__ == "__main__":
    run_sankey_analysis(
        "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/topk_idx.npy",
        "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/gate_scores.npy",
        "/exp_data/sjx/star/main_transformer_moe_weight/experiment_data/test_token_types.npy",
        "/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/expert_evolution"
    )