In [6]:
from torch import nn

def print_info(model):
    """Print a model's type and the distinct leaf names of its linear layers.

    Every submodule yielded by ``model.named_modules()`` that is an instance
    of ``nn.Linear`` (subclasses included) contributes the last dot-separated
    component of its qualified name, e.g. ``model.layers.0.mlp.up_proj``
    contributes ``up_proj``.

    Args:
        model: An object exposing ``config.model_type`` and ``named_modules()``.

    Returns:
        None. Two lines are written to stdout: the model type, then the
        de-duplicated leaf names as a sorted list.
    """
    print("model: ", model.config.model_type)

    linear_module_names = {
        name.split(".")[-1]
        for name, module in model.named_modules()
        if isinstance(module, nn.Linear)
    }

    # A raw set prints in hash order, which changes between kernel sessions;
    # sorting keeps the listing stable and easy to compare across models.
    print(sorted(linear_module_names))


### Qwen3

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-0.6B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)

# prepare the model input
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
# drop the echoed prompt tokens so only the newly generated ids remain
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# parsing thinking content
# Resolve the id of "</think>" from the tokenizer instead of hard-coding 151668,
# so the split does not depend on a literal vocabulary index.
think_end_id = tokenizer.convert_tokens_to_ids("</think>")
try:
    # rindex: position just past the LAST "</think>" in the generated ids
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
except ValueError:
    # no "</think>" was generated: treat the whole output as the answer
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)


thinking content: <think>
Okay, the user wants a short introduction to a large language model. Let me start by recalling what I know about LLMs. They are big language models, right? So I should mention their main features and applications.

First, the introduction needs to be concise. Maybe start with "Large language models" as the main topic. Then explain that they're AI systems designed to understand and generate text. It's important to highlight their ability to process and generate a vast amount of text, which makes them useful in various fields.

I should mention applications like writing, translation, customer service, and maybe something about their adaptability. Also, touch on the advancements in their training data and how they evolve with new information. Oh, and maybe mention that they can handle complex tasks, like writing a research paper or creating content.

Wait, the user wants a short intro. Let me check if I'm including too much. Need to keep it brief but comprehensiv

In [2]:
# Inspect the tokenized prompt batch that was passed to model.generate().
model_inputs

{'input_ids': tensor([[151644,    872,    198,  35127,    752,    264,   2805,  16800,    311,
           3460,   4128,   1614,     13, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [None]:
# Display the module tree of `model` (the Qwen3 checkpoint loaded in this section).
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layer

In [3]:
# Print the model type and the distinct nn.Linear leaf names for the Qwen3 model.
print_info(model)

model:  qwen3
{'k_proj', 'q_proj', 'up_proj', 'o_proj', 'down_proj', 'v_proj', 'lm_head', 'gate_proj'}


### Llama3.2

In [2]:
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "unsloth/Llama-3.2-1B"

# Tokenizer and weights are both resolved from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")

# Wrap the already-loaded model and tokenizer in a text-generation pipeline
# whose calls are capped at 50 newly generated tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    dtype=torch.bfloat16,
    device_map="auto",
    max_new_tokens=50,
)

# Left as the final bare expression so the notebook displays the result.
pipe("The key to life is")


Device set to use cuda:0


[{'generated_text': 'The key to life is to love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\nAnd love one another.\n'}]

In [3]:
# Display the module tree of `model` (the Llama 3.2 checkpoint loaded in this section).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,)

In [5]:
# Print the model type and the distinct nn.Linear leaf names for the Llama model.
print_info(model)

model:  llama
{'o_proj', 'lm_head', 'k_proj', 'up_proj', 'gate_proj', 'v_proj', 'q_proj', 'down_proj'}


### Mistral

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local directory holding the checkpoint, relative to the notebook's working directory.
model_path = "../mistral-7b-bnb-4bit"

model = AutoModelForCausalLM.from_pretrained(model_path, dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

prompt = "My favourite condiment is"

# Call the tokenizer itself (not .encode) so the attention mask travels with
# the input ids into generate(), matching the other sections of this notebook.
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# do_sample=True draws random tokens; seed torch first so a re-run on the same
# setup reproduces the same continuation.
torch.manual_seed(0)
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

'<s> My favourite condiment is hot sauce. It’s a favourite, mainly because of its versatility which allows you to spice up just about anything. I love experimenting with different sauces, spices, and herbs to craft up something new that brings a fresh flavour to an old favourite. My latest experiments have led me to bring you this recipe – a healthy twist on tuna (or chicken) salad wraps.\n\nFor those living in New Zealand, you can find the recipe for this'

In [3]:
# Display the module tree of `model` (the Mistral checkpoint loaded in this section).
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
     

In [4]:
# Print the model type and the distinct nn.Linear leaf names for the Mistral model.
print_info(model)

model:  mistral
{'k_proj', 'o_proj', 'lm_head', 'down_proj', 'q_proj', 'up_proj', 'gate_proj', 'v_proj'}


### OPT

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and the half-precision model from the same hub checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa",
)

# NOTE(review): the recorded output of this cell is the prompt followed only by
# spaces; the trailing space in the prompt is a possible cause — confirm.
prompt = "Once upon a time, in a land far, far away, "

# Tokenize as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# Sampling is disabled; generate up to 30 new tokens and show the decoded text.
generated_ids = model.generate(**model_inputs, do_sample=False, max_new_tokens=30)
tokenizer.batch_decode(generated_ids)[0]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

'</s>Once upon a time, in a land far, far away,                               '

In [5]:
# Display the module tree of `model` (the OPT checkpoint loaded in this section).
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [7]:
# Print the model type and the distinct nn.Linear leaf names for the OPT model.
print_info(model)

model:  opt
{'k_proj', 'q_proj', 'project_out', 'fc1', 'v_proj', 'out_proj', 'project_in', 'lm_head', 'fc2'}
