# Neural Network Architecture Visualization

**Duration:** ~40 min | **Platform:** Kaggle dual Tesla T4

This notebook explores **GGUF model internals** — parsing architecture metadata,
analyzing layer structures, visualizing weight distributions, and comparing
model architectures using Graphistry.

### What you'll learn
1. Parse GGUF metadata and architecture details
2. Analyze layer types and parameter counts
3. Visualize weight distributions
4. Build architecture graphs with Graphistry
5. Compare different model architectures

In [None]:
# Install llamatelemetry from GitHub, pinned to the v1.0.0 tag (-q keeps pip quiet).
!pip install -q git+https://github.com/llamatelemetry/llamatelemetry.git@v1.0.0
# matplotlib is used by the plotting cell later in this notebook.
!pip install -q matplotlib

import llamatelemetry
# Initialise llamatelemetry under the service name "arch-explorer".
# The matching llamatelemetry.shutdown() call is in the notebook's last cell.
llamatelemetry.init(service_name="arch-explorer")

## Parsing GGUF Metadata

Read architecture details directly from the GGUF file header without loading
the full model into memory.

In [None]:
from huggingface_hub import hf_hub_download
from llamatelemetry.llama import parse_gguf_header

# Fetch the Q4_K_M GGUF file for Gemma 3 1B (instruction-tuned) from the Hub.
model_path = hf_hub_download(
    cache_dir="/root/.cache/huggingface",
    filename="google_gemma-3-1b-it-Q4_K_M.gguf",
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
)

# read_tensors=True is requested because later cells iterate over info.tensors.
info = parse_gguf_header(model_path, read_tensors=True)

# File-level facts first ...
print(f"=== {info.file} ===")
print(f"Size: {info.size_mb:.1f} MB")

# ... then the architecture fields stored in the header metadata.
header_meta = info.metadata
print(f"Architecture: {header_meta.architecture}")
print(f"Layers (blocks): {header_meta.block_count}")
print(f"Embedding dim: {header_meta.embedding_length}")
print(f"Context length: {header_meta.context_length}")

tensor_total = len(info.tensors)
print(f"Total tensors: {tensor_total}")

## Layer Analysis

Categorize tensors by type and analyze parameter counts per layer.

In [None]:
from collections import defaultdict
import numpy as np

@llamatelemetry.task(name="analyze-layers")
def analyze_layers(tensors):
    """Categorize tensors by type and print a per-category parameter summary.

    Args:
        tensors: Iterable of tensor descriptors. Each item must expose
            ``name`` (str), ``shape`` (iterable of ints) and ``dtype``.

    Returns:
        A ``defaultdict(list)`` mapping a category name ("normalization",
        "attention", "feed_forward", "embedding", "output" or "other") to a
        list of ``(tensor_name, param_count, dtype)`` tuples.
    """
    categories = defaultdict(list)

    for t in tensors:
        name = t.name

        # Parameter count is the product of all dimensions (1 for an empty shape).
        params = 1
        for dim in t.shape:
            params *= dim

        # Categorize by tensor name pattern.
        # "norm" is tested first on purpose: GGUF norm tensors are named e.g.
        # "blk.0.attn_norm.weight", "blk.0.ffn_norm.weight" or
        # "blk.0.attn_q_norm.weight", so they also contain "attn"/"ffn" and
        # would otherwise be counted as attention / feed-forward weights.
        if "norm" in name:
            category = "normalization"
        elif "attn" in name or "q_proj" in name or "k_proj" in name or "v_proj" in name or "o_proj" in name:
            category = "attention"
        elif "ffn" in name or "gate_proj" in name or "up_proj" in name or "down_proj" in name:
            category = "feed_forward"
        elif "embed" in name or "token" in name:
            category = "embedding"
        elif "output" in name:
            category = "output"
        else:
            category = "other"
        categories[category].append((name, params, t.dtype))

    # Sum each category once and reuse the totals for sorting and percentages.
    totals = {cat: sum(p for _, p, _ in items) for cat, items in categories.items()}
    total_params = sum(totals.values())

    # Print summary, largest category first.
    print(f"\n{'Category':<18} {'Tensors':<10} {'Params':<15} {'% of Total'}")
    print("-" * 55)
    for cat, cat_params in sorted(totals.items(), key=lambda kv: -kv[1]):
        # Guard against an empty tensor list (total of zero parameters).
        pct = cat_params / total_params * 100 if total_params > 0 else 0
        print(f"{cat:<18} {len(categories[cat]):<10} {cat_params:<15,} {pct:.1f}%")

    return categories

categories = analyze_layers(info.tensors)

## Weight Distribution

Plot each layer category's share of the total parameter count (pie chart) and the number of tensors it contains (bar chart).

In [None]:
import matplotlib.pyplot as plt

# Rank the categories from most to fewest parameters. sorted() is stable with
# reverse=True, so categories with equal totals keep their original order.
ranked = sorted(
    categories.items(),
    key=lambda entry: sum(count for _, count, _ in entry[1]),
    reverse=True,
)
cat_names = [label for label, _ in ranked]
cat_params = [sum(count for _, count, _ in members) for _, members in ranked]
cat_counts = [len(members) for _, members in ranked]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Left panel: share of total parameters held by each category.
pie_ax.pie(cat_params, labels=cat_names, autopct="%1.1f%%", startangle=90)
pie_ax.set_title("Parameter Distribution by Layer Type")

# Right panel: how many tensors fall into each category.
bar_ax.barh(cat_names, cat_counts, color="steelblue")
bar_ax.set_xlabel("Number of Tensors")
bar_ax.set_title("Tensor Count by Layer Type")

plt.tight_layout()
plt.show()

## Architecture as Graph

Build a graph representing the model architecture, where nodes are layers
and edges represent data flow.

In [None]:
import pandas as pd
from llamatelemetry.kaggle import rapids_gpu

# NOTE(review): rapids_gpu(1) presumably selects GPU 1 for the RAPIDS/Graphistry
# work in this cell — confirm against the llamatelemetry.kaggle documentation.
with rapids_gpu(1):
    # Build architecture graph. Fall back to a single block when the header
    # does not report a block count.
    n_blocks = info.metadata.block_count or 1

    nodes = [{"id": "input", "type": "embedding", "layer": -1}]
    edges = []

    # `prev` is the node whose output feeds the next stage. It starts at the
    # embedding and is advanced to each block's FFN node, so that every FFN
    # has an outgoing edge and the chain input -> blocks -> output is unbroken.
    prev = "input"
    for i in range(n_blocks):
        nodes.extend([
            {"id": f"attn_{i}", "type": "attention", "layer": i},
            {"id": f"norm_attn_{i}", "type": "normalization", "layer": i},
            {"id": f"ffn_{i}", "type": "feed_forward", "layer": i},
            {"id": f"norm_ffn_{i}", "type": "normalization", "layer": i},
        ])

        # Within a block: norm -> attention -> norm -> feed-forward.
        edges.extend([
            {"src": prev, "dst": f"norm_attn_{i}", "type": "residual"},
            {"src": f"norm_attn_{i}", "dst": f"attn_{i}", "type": "forward"},
            {"src": f"attn_{i}", "dst": f"norm_ffn_{i}", "type": "residual"},
            {"src": f"norm_ffn_{i}", "dst": f"ffn_{i}", "type": "forward"},
        ])
        prev = f"ffn_{i}"

    # The last block's FFN (or the embedding, if there were no blocks) feeds
    # the output node.
    nodes.append({"id": "output", "type": "output", "layer": n_blocks})
    edges.append({"src": prev, "dst": "output", "type": "forward"})

    node_df = pd.DataFrame(nodes)
    edge_df = pd.DataFrame(edges)

    print(f"Architecture graph: {len(node_df)} nodes, {len(edge_df)} edges")
    print(f"Blocks: {n_blocks}")

    # Rendering is best-effort: if Graphistry is missing or fails for any
    # reason, report the error and fall back to a plain-text edge listing.
    try:
        import graphistry
        g = (graphistry
             .edges(edge_df, "src", "dst")
             .nodes(node_df, "id")
             .bind(point_title="id", edge_title="type")
             .encode_point_color("type", categorical_mapping={
                 "embedding": "blue", "attention": "red",
                 "feed_forward": "green", "normalization": "gray",
                 "output": "orange",
             }))
        g.plot()
    except Exception as e:
        print(f"Graphistry: {e}")
        print("\nArchitecture (first 3 blocks):")
        # Each block contributes 4 edges, so 12 rows cover the first 3 blocks.
        for _, row in edge_df.head(12).iterrows():
            print(f"  {row['src']} → {row['dst']} ({row['type']})")

## Attention Mechanism Structure

Analyze the attention head configuration from GGUF metadata.

In [None]:
meta = info.metadata

# Attention configuration.
# Projection weights are recognised under two naming schemes:
#   * GGUF-native names such as "blk.0.attn_q.weight". These are matched on
#     whole dot-separated components so that norm tensors like
#     "blk.0.attn_q_norm.weight" or "blk.0.attn_norm.weight" are excluded.
#   * Hugging Face-style "*_proj" names, matched by substring.
hf_proj_keys = ["q_proj", "k_proj", "v_proj", "o_proj"]
gguf_proj_parts = {"attn_q", "attn_k", "attn_v", "attn_output", "attn_qkv"}
attn_tensors = [
    t for t in info.tensors
    if any(k in t.name for k in hf_proj_keys)
    or not gguf_proj_parts.isdisjoint(t.name.split("."))
]

print("Attention Configuration:")
print(f"  Embedding dim: {meta.embedding_length}")
print(f"  Layers: {meta.block_count}")
print(f"  Attention tensors: {len(attn_tensors)}")

# Analyze Q/K/V shapes.
# NOTE(review): taking the first four entries assumes the tensor list is
# ordered block by block — confirm against parse_gguf_header's output order.
print(f"\nAttention tensor shapes (first block):")
for t in attn_tensors[:4]:
    print(f"  {t.name}: {t.shape} ({t.dtype})")

# Estimate head count from Q projection shape
q_tensors = [
    t for t in attn_tensors
    if "q_proj" in t.name or "attn_q" in t.name.split(".")
]
if q_tensors:
    q_shape = q_tensors[0].shape
    # Prefer the embedding width from metadata; fall back to the last
    # dimension of the Q projection when the metadata field is missing.
    embed_dim = meta.embedding_length or q_shape[-1]
    # Head dim is typically 64 or 128, so print one candidate head count for
    # each; this is a heuristic, not a value read from the file.
    for head_dim in [64, 128]:
        n_heads = embed_dim // head_dim
        if n_heads > 0:
            print(f"\n  Estimated heads (dim={head_dim}): {n_heads}")

## Comparing Architectures

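Download a second GGUF file and compare the two side-by-side. As a demo, this cell compares two quantizations (Q4_K_M and Q5_K_M) of the same model; the same code works for two files with different architectures.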

In [None]:
# Side-by-side comparison. The demo uses two quantizations of the same model,
# so only the second (Q5_K_M) file needs to be downloaded here.
model_q5_path = hf_hub_download(
    cache_dir="/root/.cache/huggingface",
    filename="google_gemma-3-1b-it-Q5_K_M.gguf",
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
)

info_q4 = parse_gguf_header(model_path, read_tensors=True)
info_q5 = parse_gguf_header(model_q5_path, read_tensors=True)

# Table header.
print(f"{'Property':<25} {'Q4_K_M':<20} {'Q5_K_M'}")
print("-" * 60)
print(f"{'File size (MB)':<25} {info_q4.size_mb:<20.1f} {info_q5.size_mb:.1f}")

# The metadata rows all share one layout, so drive them from a small table of
# (row label, metadata attribute name) pairs.
for row_label, field in (
    ("Architecture", "architecture"),
    ("Layers", "block_count"),
    ("Embedding dim", "embedding_length"),
    ("Context length", "context_length"),
):
    q4_value = getattr(info_q4.metadata, field)
    q5_value = getattr(info_q5.metadata, field)
    print(f"{row_label:<25} {str(q4_value):<20} {q5_value}")

print(f"{'Tensor count':<25} {len(info_q4.tensors):<20} {len(info_q5.tensors)}")

# File size of the Q5 file relative to the Q4 baseline.
size_ratio = info_q5.size_mb / info_q4.size_mb
print(f"{'Size ratio':<25} {'1.00x':<20} {size_ratio:.2f}x")

## Summary

This notebook demonstrated:
- **GGUF parsing** without loading model weights into GPU memory
- **Layer analysis** with automatic categorization
- **Architecture graphs** rendered with Graphistry
- **Model comparison** across quantization levels

These tools help you understand model internals before deploying on constrained hardware.

In [None]:
# Call llamatelemetry's shutdown(), the counterpart of the init() call made in
# the notebook's first code cell, then print a completion marker.
llamatelemetry.shutdown()
print("Done.")