# Utility Functions

In [7]:
from ipywidgets import widgets
from IPython.display import display
from pprint import pprint


def show_scrollable(content, height="300px"):
    """
    Render ``content`` inside a bordered, fixed-height, scrollable output box.

    An ``ipywidgets`` ``Output`` widget is created and displayed, and the
    content is then written into it: lists and dicts go through ``pprint``,
    ``None`` is shown as the text ``None``, and anything else is written
    with ``print``.

    Args:
        content: Object to show (string, list, dict, or any printable object)
        height: Height of the scrollable box (e.g., '300px', '500px', '50%')

    Examples:
        >>> # A long list
        >>> show_scrollable([f"Item {i}" for i in range(100)])

        >>> # A dictionary in a taller box
        >>> big_dict = {i: f"Value {i}" for i in range(50)}
        >>> show_scrollable(big_dict, height='400px')

        >>> # Plain multi-line text
        >>> show_scrollable("Lorem ipsum...\\n" * 50)
    """
    box_layout = {
        "height": height,
        "overflow": "auto",
        "border": "1px solid #ddd",
        "padding": "5px",
    }
    scroll_box = widgets.Output(layout=box_layout)
    # The box is displayed before anything is written into it.
    display(scroll_box)

    with scroll_box:
        if content is None:
            print("None")
        elif isinstance(content, (list, dict)):
            pprint(content)
        else:
            print(content)

# Inspecting the DeepSeek-R1-Distill-Qwen-1.5B Model

In [1]:
from transformers import AutoModelForCausalLM, AutoConfig

# Hugging Face Hub id of the checkpoint inspected in this section.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Load the configuration object for the checkpoint.
config = AutoConfig.from_pretrained(model_name)
# NOTE(review): `model` is not used anywhere in this cell (only `config` is
# printed), and the later cells visible in this notebook call from_pretrained
# again themselves. Confirm this load is still needed.
model = AutoModelForCausalLM.from_pretrained(model_name)

print(config)  # Full architecture details

  from .autonotebook import tqdm as notebook_tqdm


Qwen2Config {
  "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 131072,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.0",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}



In [3]:
from transformers import AutoModelForCausalLM
from torchinfo import summary
import torch

# Checkpoint to summarize (Hugging Face Hub id).
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# trust_remote_code is left at its default (False). The checkpoint's config
# names the built-in Qwen2ForCausalLM architecture, so no repository-supplied
# Python needs to run, and enabling the flag would only permit executing code
# from the Hub repository.
model = AutoModelForCausalLM.from_pretrained(model_name)

# input_size=(1, 128) with dtypes=[torch.int64] makes torchinfo build an
# integer dummy input; the model's first layer is an embedding lookup, which
# needs integer token ids.
summary(model, input_size=(1, 128), dtypes=[torch.int64])

Layer (type:depth-idx)                             Output Shape              Param #
Qwen2ForCausalLM                                   [1, 2, 128, 128]          --
├─Qwen2Model: 1-1                                  [1, 2, 128, 128]          --
│    └─Embedding: 2-1                              [1, 128, 1536]            233,373,696
│    └─Qwen2RotaryEmbedding: 2-2                   [1, 128, 128]             --
│    └─ModuleList: 2-3                             --                        --
│    │    └─Qwen2DecoderLayer: 3-1                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-2                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-3                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-4                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-5                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-6                 [1, 128, 1536] 

In [1]:
from transformers import AutoModelForCausalLM

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# trust_remote_code is left at its default (False). The checkpoint loads as
# the built-in Qwen2ForCausalLM class, so there is no need to allow code from
# the Hub repository to execute locally.
model = AutoModelForCausalLM.from_pretrained(model_name)

# named_modules() yields the root module first (under an empty name) and then
# every submodule, so this prints the whole tree followed by each sub-tree
# again. To narrow the dump to the per-layer projection modules, guard the
# print with:
#     if "layers" in name and "proj" in name:
for name, module in model.named_modules():
    print(name, module)

 Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Q

In [None]:
from transformers import AutoModelForCausalLM, AutoConfig

# Load the configuration published under the Hub id "Qwen/Qwen1.5-1.8B" so it
# can be compared with the distilled checkpoint's configuration.
# NOTE(review): the name `original_config` suggests this is the base model of
# DeepSeek-R1-Distill-Qwen-1.5B, but the two printed configs differ in
# hidden_size (2048 vs 1536) and num_hidden_layers (24 vs 28). The DeepSeek-R1
# model card appears to list Qwen2.5-Math-1.5B as the base checkpoint — confirm
# which model is the intended comparison.
original_config = AutoConfig.from_pretrained("Qwen/Qwen1.5-1.8B")
print(original_config)  # Compare hidden_size, layers, etc.

Qwen2Config {
  "_name_or_path": "Qwen/Qwen1.5-1.8B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

