In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel, AutoModelForSequenceClassification
from dotenv import load_dotenv
import os
load_dotenv()
from torchinfo import summary
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
model_name = "speakleash/Bielik-1.5B-v3"

tokenizer = AutoTokenizer.from_pretrained(model_name)




In [6]:
model_2 = AutoModel.from_pretrained(model_name)
model_2

LlamaModel(
  (embed_tokens): Embedding(32000, 1536)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (o_proj): Linear(in_features=1536, out_features=1536, bias=True)
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (up_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (down_proj): Linear(in_features=8960, out_features=1536, bias=True)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
      (post_attention_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
    )
  )
  (norm): LlamaRMSNorm((1536,), eps=1e-06)
  (rotary_emb): LlamaRotaryEmbedding()
)

In [7]:
dummy_input = torch.randint(0, tokenizer.vocab_size, (1, 256))
summary(model_2, input_data=dummy_input)

Layer (type:depth-idx)                        Output Shape              Param #
LlamaModel                                    --                        --
├─Embedding: 1-1                              [1, 256, 1536]            49,152,000
├─LlamaRotaryEmbedding: 1-2                   [1, 256, 128]             --
├─ModuleList: 1-3                             --                        --
│    └─LlamaDecoderLayer: 2-1                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-1                 [1, 256, 1536]            1,536
│    │    └─LlamaAttention: 3-2               [1, 256, 1536]            5,508,608
│    │    └─LlamaRMSNorm: 3-3                 [1, 256, 1536]            1,536
│    │    └─LlamaMLP: 3-4                     [1, 256, 1536]            41,307,136
│    └─LlamaDecoderLayer: 2-2                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-5                 [1, 256, 1536]            1,536
│    │    └─LlamaAttention: 3-6               [1, 256, 1536]   

In [8]:
output = model_2(dummy_input)

In [9]:
output

BaseModelOutputWithPast(last_hidden_state=tensor([[[ 1.6982, -3.0423,  0.2673,  ...,  0.5953, -1.4663, -0.2163],
         [ 0.5338, -2.1484, -0.4599,  ..., -2.8242, -4.7383, -0.2091],
         [ 1.3170, -0.1391, -1.6626,  ...,  0.5867,  1.2374, -0.4145],
         ...,
         [ 0.5909, -0.8897, -1.3753,  ..., -1.4780, -1.4284, -0.1146],
         [ 0.3961, -0.8226, -1.4473,  ..., -0.8169, -0.7911,  0.1908],
         [ 0.2619, -0.6083, -2.5031,  ..., -0.5375, -0.3433, -0.1376]]],
       grad_fn=<MulBackward0>), past_key_values=DynamicCache(layers=[DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer]

In [10]:
dummy_input.shape, output.last_hidden_state.shape

(torch.Size([1, 256]), torch.Size([1, 256, 1536]))

In [11]:
for elem in model_2.parameters():
    elem.requires_grad = False

In [12]:
dummy_input = torch.randint(0, tokenizer.vocab_size, (1, 256))
summary(model_2, input_data=dummy_input)

Layer (type:depth-idx)                        Output Shape              Param #
LlamaModel                                    --                        --
├─Embedding: 1-1                              [1, 256, 1536]            (49,152,000)
├─LlamaRotaryEmbedding: 1-2                   [1, 256, 128]             --
├─ModuleList: 1-3                             --                        --
│    └─LlamaDecoderLayer: 2-1                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-1                 [1, 256, 1536]            (1,536)
│    │    └─LlamaAttention: 3-2               [1, 256, 1536]            (5,508,608)
│    │    └─LlamaRMSNorm: 3-3                 [1, 256, 1536]            (1,536)
│    │    └─LlamaMLP: 3-4                     [1, 256, 1536]            (41,307,136)
│    └─LlamaDecoderLayer: 2-2                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-5                 [1, 256, 1536]            (1,536)
│    │    └─LlamaAttention: 3-6               [1, 2

In [13]:
for elem in model_2.layers[-15:]:
    for param in elem.parameters():
        param.requires_grad = True

In [14]:
dummy_input = torch.randint(0, tokenizer.vocab_size, (1, 256))
summary(model_2, input_data=dummy_input)

Layer (type:depth-idx)                        Output Shape              Param #
LlamaModel                                    --                        --
├─Embedding: 1-1                              [1, 256, 1536]            (49,152,000)
├─LlamaRotaryEmbedding: 1-2                   [1, 256, 128]             --
├─ModuleList: 1-3                             --                        --
│    └─LlamaDecoderLayer: 2-1                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-1                 [1, 256, 1536]            (1,536)
│    │    └─LlamaAttention: 3-2               [1, 256, 1536]            (5,508,608)
│    │    └─LlamaRMSNorm: 3-3                 [1, 256, 1536]            (1,536)
│    │    └─LlamaMLP: 3-4                     [1, 256, 1536]            (41,307,136)
│    └─LlamaDecoderLayer: 2-2                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-5                 [1, 256, 1536]            (1,536)
│    │    └─LlamaAttention: 3-6               [1, 2

In [15]:
model_2

LlamaModel(
  (embed_tokens): Embedding(32000, 1536)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (o_proj): Linear(in_features=1536, out_features=1536, bias=True)
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (up_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (down_proj): Linear(in_features=8960, out_features=1536, bias=True)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
      (post_attention_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
    )
  )
  (norm): LlamaRMSNorm((1536,), eps=1e-06)
  (rotary_emb): LlamaRotaryEmbedding()
)

In [16]:
model_2.norm.requires_grad=True

In [17]:
dummy_input = torch.randint(0, tokenizer.vocab_size, (1, 256))
summary(model_2, input_data=dummy_input)

Layer (type:depth-idx)                        Output Shape              Param #
LlamaModel                                    --                        --
├─Embedding: 1-1                              [1, 256, 1536]            (49,152,000)
├─LlamaRotaryEmbedding: 1-2                   [1, 256, 128]             --
├─ModuleList: 1-3                             --                        --
│    └─LlamaDecoderLayer: 2-1                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-1                 [1, 256, 1536]            (1,536)
│    │    └─LlamaAttention: 3-2               [1, 256, 1536]            (5,508,608)
│    │    └─LlamaRMSNorm: 3-3                 [1, 256, 1536]            (1,536)
│    │    └─LlamaMLP: 3-4                     [1, 256, 1536]            (41,307,136)
│    └─LlamaDecoderLayer: 2-2                 [1, 256, 1536]            --
│    │    └─LlamaRMSNorm: 3-5                 [1, 256, 1536]            (1,536)
│    │    └─LlamaAttention: 3-6               [1, 2

In [18]:
model = AutoModel.from_pretrained(model_name, num_labels=3)

In [19]:
model

LlamaModel(
  (embed_tokens): Embedding(32000, 1536)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (o_proj): Linear(in_features=1536, out_features=1536, bias=True)
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (up_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (down_proj): Linear(in_features=8960, out_features=1536, bias=True)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
      (post_attention_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
    )
  )
  (norm): LlamaRMSNorm((1536,), eps=1e-06)
  (rotary_emb): LlamaRotaryEmbedding()
)

In [20]:
model_2

LlamaModel(
  (embed_tokens): Embedding(32000, 1536)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (o_proj): Linear(in_features=1536, out_features=1536, bias=True)
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (up_proj): Linear(in_features=1536, out_features=8960, bias=True)
        (down_proj): Linear(in_features=8960, out_features=1536, bias=True)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
      (post_attention_layernorm): LlamaRMSNorm((1536,), eps=1e-06)
    )
  )
  (norm): LlamaRMSNorm((1536,), eps=1e-06)
  (rotary_emb): LlamaRotaryEmbedding()
)

In [21]:
for param in model_2.norm.parameters():
    print(param)

Parameter containing:
tensor([2.6406, 2.9375, 2.9219,  ..., 3.0000, 3.0312, 3.0938])
