In [1]:
# https://huggingface.co/docs/transformers/en/model_doc/llama3
import transformers
import torch

# https://huggingface.co/meta-llama/Llama-3.2-3B
model_id = "meta-llama/Llama-3.2-3B"

# The checkpoint's generation config enables sampling (do_sample=True), so seed the
# Python / NumPy / torch RNGs once up front. Without this, every Restart & Run All
# produces different generations and none of the printed text can be reproduced.
SEED = 0
transformers.set_seed(SEED)

# Alternative spelling kept for reference: the dtype can also be handed over as
#   model_kwargs={"torch_dtype": torch.bfloat16}
# instead of the top-level torch_dtype argument used below.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Show the sampling defaults shipped with the checkpoint, and where / in which
# precision the weights ended up.
print(pipeline.generation_config)
print(f"{pipeline.device=}")
print(f"{pipeline.model.dtype=}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps


GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9
}

pipeline.device=device(type='mps')
pipeline.model.dtype=torch.bfloat16


In [2]:
# Compare the EOS id as seen by the model config, the tokenizer and the
# generation config; the cell displays them side by side as a tuple.
(
    pipeline.model.config.eos_token_id,
    pipeline.tokenizer.eos_token_id,
    pipeline.generation_config.eos_token_id,
)

(128001, 128001, 128001)

In [3]:
input_text = "Simply put, the theory of relativity states that"

# Reuse EOS as the padding token. It has to be set on both the model's
# generation config and the tokenizer to suppress the warning.
eos_id = pipeline.model.config.eos_token_id
pipeline.model.generation_config.pad_token_id = eos_id
pipeline.tokenizer.pad_token_id = eos_id

# Ask for two sampled continuations of the single prompt.
outputs = pipeline(
    input_text,
    batch_size=1,
    num_return_sequences=2,
    padding=True,
    truncation=True,
    max_length=64,
)
for output in outputs:
    # Each generation is followed by one blank line.
    print(output['generated_text'], end="\n\n")

Simply put, the theory of relativity states that the speed of light is constant. This is an extremely important and revolutionary concept in physics and mathematics. The theory of relativity is based on the assumption that all observers will agree on the speed of light, and this is true for all observers. The speed of light

Simply put, the theory of relativity states that the speed of light is constant for all observers, regardless of the observer's speed. In other words, the speed of light is the same for all observers. This is a fundamental principle of physics and is one of the most important ideas in modern physics. The theory



In [4]:
input_texts = [
    "Simply put, the theory of relativity states that",
    "The phenomenon of global warming refers to the",
]

# Pad the shorter prompt on the left before running both prompts as one batch.
pipeline.tokenizer.padding_side = "left"
outputs = pipeline(
    input_texts,
    batch_size=2,
    padding=True,
    truncation=True,
    max_length=64,
)
for output in outputs:
    # With a list of prompts each result is itself a list; print its first entry,
    # followed by one blank line.
    print(output[0]['generated_text'], end="\n\n")

Simply put, the theory of relativity states that the laws of physics are the same for all observers, as long as they all move at the same speed relative to each other. This is a very difficult concept to understand, but the most important thing to remember is that nothing can travel faster than the speed of light

The phenomenon of global warming refers to the warming of the Earth’s climate over the past century. Scientists have identified a number of causes of global warming, including the release of greenhouse gases such as carbon dioxide and methane into the atmosphere. These gases trap heat, causing the Earth to warm. Other causes of global



In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Pick an available accelerator instead of assuming Apple Silicon, so the
# notebook also runs on machines without MPS. MPS is tried first (it is what
# the recorded outputs were produced on), then CUDA, with CPU as the fallback.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
# Standalone tokenizer for the same checkpoint, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

# Concrete tokenizer class, and the code object behind tokenizer.__call__
# (its repr names the defining file and line).
print(tokenizer.__class__)
print(tokenizer.__call__.__code__)
print()

# One line per special token: name, token string and vocabulary id.
special_tokens = tokenizer.special_tokens_map
for name, token in special_tokens.items():
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(name, token, token_id, sep=",\t")
print()

print(f"{tokenizer.padding_side=}")
print(f"{tokenizer.eos_token=}")
print(f"{tokenizer.bos_token=}")

<class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
<code object __call__ at 0x123285020, file "/Users/jyotirmaya.mahanta/projects/thelonejordan/personal/deeplearning.scratchpad/.venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2783>

bos_token,	<|begin_of_text|>,	128000
eos_token,	<|end_of_text|>,	128001

tokenizer.padding_side='left'
tokenizer.eos_token='<|end_of_text|>'
tokenizer.bos_token='<|begin_of_text|>'


In [7]:
# The special-token map printed earlier lists only BOS and EOS, so register EOS
# as the pad token before asking the tokenizer to pad a batch.
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
inputs = tokenizer(
    input_texts,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=64,
)

# Convert the tensors to plain Python lists, one row per prompt, for printing.
token_ids = [row.tolist() for row in inputs["input_ids"]]
attention_mask = [row.tolist() for row in inputs["attention_mask"]]
print(*token_ids, sep="\n")
print(*attention_mask, sep="\n")
print()

# Decode with special tokens kept so the padding and BOS markers stay visible.
outputs = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
print(*outputs, sep="\n")

[128000, 61346, 2231, 11, 279, 10334, 315, 1375, 44515, 5415, 430]
[128001, 128001, 128000, 791, 25885, 315, 3728, 24808, 19813, 311, 279]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

<|begin_of_text|>Simply put, the theory of relativity states that
<|end_of_text|><|end_of_text|><|begin_of_text|>The phenomenon of global warming refers to the


In [8]:
# Load the causal-LM weights directly (no pipeline wrapper), in bfloat16,
# placed on `device`, then print the resulting model configuration.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
print(model.config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.2-3B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 128256
}



In [9]:
# Print the model's module tree (decoder layers and their projection sizes).
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [10]:
# Confirm where the weights live and in which precision they were loaded.
print(f"{model.device=}", f"{model.dtype=}", sep="\n")

model.device=device(type='mps', index=0)
model.dtype=torch.bfloat16


In [11]:
# https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig.from_pretrained.example
from transformers import GenerationConfig

# Load the generation defaults published with the checkpoint. With
# return_unused_kwargs=True the call returns a (config, leftover kwargs) pair.
generation_config, unused_kwargs = GenerationConfig.from_pretrained(
    model_id,
    return_unused_kwargs=True,
)
print(generation_config, unused_kwargs, sep="\n")

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9
}

{}


In [12]:
# Move every tensor produced by the tokenizer onto the device the model was loaded on.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# The pad token id has to be set on the model's generation config as well,
# otherwise generate() emits a warning.
model.generation_config.pad_token_id = tokenizer.pad_token_id

# https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate
# https://github.com/huggingface/transformers/blob/a22a4378d97d06b7a1d9abad6e0086d30fdea199/src/transformers/generation/utils.py#L1914
# Keyword arguments passed to generate() that match attributes of `generation_config`
# override them, so temperature=0.9 replaces the 0.6 loaded from the checkpoint.
output_ids = model.generate(
    **inputs,
    generation_config=generation_config,
    max_length=64,
    temperature=0.9,
)
print()

# Decode the whole batch without special tokens, one blank line between generations.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
print(*outputs, sep="\n\n")


Simply put, the theory of relativity states that the speed of light (speed of electromagnetism) is the same for all observers no matter how fast they may be moving. As one observer travels at a speed of 100 miles per hour, another observer can only see the first observer as moving at 100

The phenomenon of global warming refers to the rising of temperature levels in the atmosphere because of the rise in greenhouse gas levels. Greenhouse gases include Carbon dioxide, water vapour, nitrous oxide, methane, and chlorofluorocarbons (CFCs).
Greenhouse gases are the main reason


In [13]:
# Context lengths: see the Llama weight-conversion script in transformers:
# https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama/convert_llama_weights_to_hf.py#L96