In [1]:
from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
import torch

In [10]:
def list_linear_suffixes(model, layer_types=torch.nn.Linear):
    """Return the sorted, de-duplicated leaf names of a model's Linear layers.

    The "leaf name" is the last dot-separated component of the qualified
    module name reported by ``model.named_modules()``; for example
    ``layers.0.self_attn.q_proj`` contributes ``q_proj``.

    Args:
        model: A ``torch.nn.Module`` whose sub-modules are inspected.
        layer_types: Module class, or iterable of module classes, to look
            for. Defaults to ``torch.nn.Linear``, which reproduces the
            original behaviour.

    Returns:
        list[str]: Unique leaf names in ascending order. The list is empty
        when no named sub-module matches.
    """
    # isinstance() needs a class or a tuple of classes, so normalise any other
    # iterable (list, set, ...) that a caller may pass in.
    if not isinstance(layer_types, type):
        layer_types = tuple(layer_types)

    suffixes = {
        name.rpartition(".")[2]
        for name, module in model.named_modules()
        # named_modules() reports the root module under the empty name "";
        # an empty suffix carries no information, so it is skipped.
        if name and isinstance(module, layer_types)
    }
    return sorted(suffixes)


In [19]:
# Load the Qwen3-0.6B checkpoint from the local model directory.
qwen3 = AutoModel.from_pretrained("../../llms/Qwen/Qwen3-0.6B")
# First line: unique Linear leaf names; remaining lines: the module tree.
print(list_linear_suffixes(qwen3), qwen3, sep="\n")

['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
Qwen3Model(
  (embed_tokens): Embedding(151936, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_atte

In [18]:
# Load the MiniCPM4-0.5B checkpoint from the local model directory.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run; presumably required because the MiniCPM* classes come from the
# checkpoint itself - confirm before pointing this at an untrusted directory.
minicpm4 = AutoModel.from_pretrained(
    "../../llms/openbmb/MiniCPM4-0.5B",
    trust_remote_code=True,
)
# First line: unique Linear leaf names; remaining lines: the module tree.
print(list_linear_suffixes(minicpm4), minicpm4, sep="\n")

['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
MiniCPMModel(
  (embed_tokens): Embedding(73448, 1024)
  (layers): ModuleList(
    (0-23): 24 x MiniCPMDecoderLayer(
      (self_attn): MiniCPMFlashAttention2(
        (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (k_proj): Linear(in_features=1024, out_features=128, bias=False)
        (v_proj): Linear(in_features=1024, out_features=128, bias=False)
        (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (rotary_emb): MiniCPMLongRoPE()
      )
      (mlp): MiniCPMMLP(
        (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
        (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
        (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): MiniCPMRMSNorm()
      (post_attention_layernorm): MiniCPMRMSNorm()
    )
  )
  (norm): MiniCPMRMS

In [17]:
# Load the multilingual-e5-large checkpoint from the local model directory.
# trust_remote_code is deliberately not enabled: the checkpoint resolves to
# XLMRobertaModel, so there is no reason to opt in to executing
# checkpoint-supplied code.
# NOTE(review): assumes the checkpoint's config declares no custom auto_map -
# confirm against its config.json.
mle5 = AutoModel.from_pretrained("../../llms/intfloat/multilingual-e5-large")
# Unique leaf names of the Linear layers, then the full module tree.
print(list_linear_suffixes(mle5))
print(mle5)

['dense', 'key', 'query', 'value']
XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): 