In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

# Load the model in FP16 and move it to CUDA
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Ensure the model is on the correct device
model.to(device)

print(f"Model loaded on: {device}")
print(f"Model dtype: {model.dtype}")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.
You shouldn't move a model that is dispatched using accelerate hooks.


Model loaded on: cuda
Model dtype: torch.float16


In [10]:
import torch
from torch import nn
from typing import Dict
import re

def count_parameters(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters())

def analyze_model_layers(model: nn.Module) -> Dict[str, int]:
    layer_params: Dict[str, int] = {}
    for name, module in model.named_modules():
        if len(list(module.children())) == 0:  # If it's a leaf module
            num_params = sum(p.numel() for p in module.parameters())
            if num_params > 0:
                layer_params[name] = num_params
    return layer_params

def extract_layer_id(layer_name: str) -> tuple:
    match = re.search(r'layers\.(\d+)', layer_name)
    if match:
        return (int(match.group(1)), layer_name)
    return (-1, layer_name)  # For non-numbered layers

# Assuming 'model' is already loaded and available in your notebook
# Print basic model info
print(f"Model is on device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

# Count and print the total number of parameters
total_params: int = count_parameters(model)
print(f"Total number of parameters: {total_params:,}")

# Analyze and print the number of parameters in each layer
print("\nAnalyzing model layers...")
layer_params = analyze_model_layers(model)

# Sort layers by layer ID and then by name
sorted_layers = sorted(layer_params.items(), key=lambda x: extract_layer_id(x[0]))

# Print sorted layers
for layer_name, num_params in sorted_layers:
    print(f"{layer_name}: {num_params:,} parameters")

# Print summary of top-level modules
print("\nSummary of top-level modules:")
for name, module in model.named_children():
    num_params = sum(p.numel() for p in module.parameters())
    print(f"{name}: {num_params:,} parameters")

Model is on device: cuda:0
Model dtype: torch.float16
Total number of parameters: 8,030,261,248

Analyzing model layers...
lm_head: 525,336,576 parameters
model.embed_tokens: 525,336,576 parameters
model.norm: 4,096 parameters
model.layers.0.input_layernorm: 4,096 parameters
model.layers.0.mlp.down_proj: 58,720,256 parameters
model.layers.0.mlp.gate_proj: 58,720,256 parameters
model.layers.0.mlp.up_proj: 58,720,256 parameters
model.layers.0.post_attention_layernorm: 4,096 parameters
model.layers.0.self_attn.k_proj: 4,194,304 parameters
model.layers.0.self_attn.o_proj: 16,777,216 parameters
model.layers.0.self_attn.q_proj: 16,777,216 parameters
model.layers.0.self_attn.v_proj: 4,194,304 parameters
model.layers.1.input_layernorm: 4,096 parameters
model.layers.1.mlp.down_proj: 58,720,256 parameters
model.layers.1.mlp.gate_proj: 58,720,256 parameters
model.layers.1.mlp.up_proj: 58,720,256 parameters
model.layers.1.post_attention_layernorm: 4,096 parameters
model.layers.1.self_attn.k_proj: 