In [17]:
import torch
import pandas as pd
import numpy as np
from transformers import GPT2LMHeadModel

# Load model
print("üöÄ Loading GPT-2 (small) model...")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

def show_statistics(model):
    records = []

    print("üîç Computing statistics for each parameter...")
    for name, param in model.named_parameters():
        if param.numel() == 0:
            continue
        records.append({
            "parameter_name": name,
            "min": param.min().item(),
            "max": param.max().item(),
            "avg": param.mean().item(),
            "std": param.std().item(),
            "numel": param.numel()
        })

    # Create DataFrame
    df = pd.DataFrame(records)

    # Reorder columns for readability
    df = df[["parameter_name", "min", "max", "avg", "std", "numel"]]

    # Pretty-print table (same format as requested)
    print("\n" + "="*105)
    print(f"{'Parameter Name':<45} | {'Min':>10} | {'Max':>10} | {'Avg':>10} | {'Std':>10} | {'#Params':>8}")
    print("-"*105)

    for _, row in df.iterrows():
        print(
            f"{row['parameter_name']:<45} | "
            f"{row['min']:10.4e} | "
            f"{row['max']:10.4e} | "
            f"{row['avg']:10.4e} | "
            f"{row['std']:10.4e} | "
            f"{row['numel']:8,}"
        )

    print("-"*105)
    print(f"‚úÖ Total parameters: {df['numel'].sum():,}")
    print(f"‚úÖ DataFrame shape: {df.shape} (rows=parameters, cols=stats)")

    # Optional: Save to file
    # df.to_csv("gpt2_weight_stats.csv", index=False)
    # df.to_excel("gpt2_weight_stats.xlsx", index=False)

    # ‚úÖ Now `df` is ready for analysis!
    # Example queries:
    print("\nüí° Example usage:")
    print(f"- Mean std across all params: {df['std'].mean():.4f}")
    print(f"- Layer with largest weight magnitude (max |max|):")
    abs_max_row = df.loc[df[['min', 'max']].abs().max(axis=1).idxmax()]
    print(f"  ‚Üí {abs_max_row['parameter_name']} (max={abs_max_row['max']:.4f}, min={abs_max_row['min']:.4f})")

    # Bonus: Add layer-level grouping (e.g., 'h.0', 'h.1', etc.)
    def extract_layer_group(name):
        if name.startswith("transformer.h."):
            # e.g., 'transformer.h.5.mlp.c_proj.weight' ‚Üí 'h.5'
            return ".".join(name.split(".")[1:3])  # ['transformer', 'h', '5', ...] ‚Üí 'h.5'
        elif name.startswith("transformer.wte") or name.startswith("transformer.wpe"):
            return "embeddings"
        elif name.startswith("transformer.ln_f"):
            return "final_ln"
        elif name == "lm_head.weight":
            return "lm_head"
        else:
            return "other"

    df["layer_group"] = df["parameter_name"].apply(extract_layer_group)

    # Optional: Per-layer-group summary (mean of stats weighted by numel)
    print("\nüìä Per-layer-group aggregated stats (numel-weighted avg):")
    layer_summary = df.groupby("layer_group").apply(
        lambda g: pd.Series({
            "total_params": g["numel"].sum(),
            "avg_min": np.average(g["min"], weights=g["numel"]),
            "avg_max": np.average(g["max"], weights=g["numel"]),
            "avg_avg": np.average(g["avg"], weights=g["numel"]),
            "avg_std": np.average(g["std"], weights=g["numel"]),
        })
    ).round(6)

    print(layer_summary.to_string(float_format="{:.6e}".format))
    return df

üöÄ Loading GPT-2 (small) model...


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-4B-Instruct-2507"
qwen_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)
qwen_model.eval()



Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:02<00:00,  1.06it/s]


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
        (post_attention_layer

In [14]:
    records = []

    print("üîç Computing statistics for each parameter...")
    for name, param in qwen_model.named_parameters():
        if param.numel() == 0:
            continue
        
        records.append({
            "parameter_name": name,
            "min": param.min().item(),
            "max": param.max().item(),
            "avg": param.mean().item(),
            "std": param.std().item(),
            "numel": param.numel()
        })
        break
    records

üîç Computing statistics for each parameter...


[{'parameter_name': 'model.embed_tokens.weight',
  'min': -0.220703125,
  'max': 0.24609375,
  'avg': -2.6464462280273438e-05,
  'std': 0.021728515625,
  'numel': 388956160}]

In [18]:
show_statistics(qwen_model)

üîç Computing statistics for each parameter...

Parameter Name                                |        Min |        Max |        Avg |        Std |  #Params
---------------------------------------------------------------------------------------------------------
model.embed_tokens.weight                     | -2.2070e-01 | 2.4609e-01 | -2.6464e-05 | 2.1729e-02 | 388,956,160
model.layers.0.self_attn.q_proj.weight        | -5.8984e-01 | 4.3945e-01 | 6.7651e-06 | 2.2949e-02 | 10,485,760
model.layers.0.self_attn.k_proj.weight        | -2.9297e-01 | 2.4805e-01 | 1.1921e-05 | 2.4170e-02 | 2,621,440
model.layers.0.self_attn.v_proj.weight        | -1.6895e-01 | 1.4648e-01 | -9.9540e-06 | 2.2705e-02 | 2,621,440
model.layers.0.self_attn.o_proj.weight        | -5.1172e-01 | 5.1172e-01 | -6.7055e-06 | 2.1362e-02 | 10,485,760
model.layers.0.self_attn.q_norm.weight        | -7.2021e-03 | 3.7500e+00 | 1.7891e+00 | 6.0156e-01 |      128
model.layers.0.self_attn.k_norm.weight        | -1.6357e-02 | 4.

  layer_summary = df.groupby("layer_group").apply(


Unnamed: 0,parameter_name,min,max,avg,std,numel,layer_group
0,model.embed_tokens.weight,-0.220703,0.246094,-2.646446e-05,0.021729,388956160,other
1,model.layers.0.self_attn.q_proj.weight,-0.589844,0.439453,6.765127e-06,0.022949,10485760,other
2,model.layers.0.self_attn.k_proj.weight,-0.292969,0.248047,1.192093e-05,0.024170,2621440,other
3,model.layers.0.self_attn.v_proj.weight,-0.168945,0.146484,-9.953976e-06,0.022705,2621440,other
4,model.layers.0.self_attn.o_proj.weight,-0.511719,0.511719,-6.705523e-06,0.021362,10485760,other
...,...,...,...,...,...,...,...
393,model.layers.35.mlp.up_proj.weight,-0.828125,0.613281,8.046627e-06,0.025513,24903680,other
394,model.layers.35.mlp.down_proj.weight,-0.691406,0.785156,-8.419156e-07,0.022339,24903680,other
395,model.layers.35.input_layernorm.weight,0.211914,17.375000,3.921875e+00,1.187500,2560,other
396,model.layers.35.post_attention_layernorm.weight,0.002991,23.875000,1.859375e+00,0.632812,2560,other
