### Instalar requisitos


In [1]:
# Install required packages
!pip install tensorflow>=2.15.0 tqdm>=4.66 torch==2.4.0 transformers tiktoken

### Baixar pesos do GPT2

In [2]:
# Fetch the helper script that downloads and loads the GPT2 weights
import urllib.request

url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
# The local file name is the last path component of the URL
filename = url.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename=filename)

# The import has to come after the download: the module only exists once the
# script has been saved next to the notebook
from gpt_download import download_and_load_gpt2

# Download (or reuse) the 124M checkpoint under ./gpt2
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

2025-02-14 13:38:14.070647: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-14 13:38:14.086240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739551094.105762  737646 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739551094.112202  737646 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-14 13:38:14.132665: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


### Verificar se baixou corretamente

In [3]:
# Sanity-check what was loaded: the settings, the top-level parameter keys,
# and the token-embedding matrix together with its shape
print(f"Configuração: {settings}")
print(f"Chaves dos dicionários de parâmetros: {params.keys()}")
print(params["wte"])
print(f"Dimensão do tensor de pesos: {params['wte'].shape}")

Configuração: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Chaves dos dicionários de parâmetros: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Dimensão do tensor de pesos: (50257, 768)


### Carregar um modelo qualquer

In [4]:
# Load a pre-trained GPT-2 style checkpoint from the Hugging Face hub
from transformers import GPT2Config, GPT2LMHeadModel

model_name = "pierreguillou/gpt2-small-portuguese"

# Start from the checkpoint's own configuration, then set the sizes explicitly
config = GPT2Config.from_pretrained(model_name)
config.update(
    dict(
        n_ctx=1024,   # Length
        n_layer=12,   # Number of layers
        n_head=12,    # Number of heads
        n_embd=768,   # Embedding dimension
    )
)

# Instantiate the model with that configuration and switch to inference mode
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
model.eval()

# Show what was loaded
print(f"Modelo carregado {model_name} com a configuração {config}")

Modelo carregado pierreguillou/gpt2-small-portuguese com a configuração GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.48.3",
  "use_cache": true,
  "vocab_size": 50257
}



### Ajustar configurações do modelo qualquer

In [5]:
# Create dictionary of GPT model sizes
model_configs = {
    "gpt2-small (124M)": {"n_embd": 768, "n_layer": 12, "n_head": 12},
    "gpt2-medium (355M)": {"n_embd": 1024, "n_layer": 24, "n_head": 16},
    "gpt2-large (774M)": {"n_embd": 1280, "n_layer": 36, "n_head": 20},
    "gpt2-xl (1558M)": {"n_embd": 1600, "n_layer": 48, "n_head": 25},
}

# Not trained model config
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "n_ctx": 256,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

# Set the model name to test
model_name = "gpt2-small (124M)"

# Merge configs
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"n_ctx": 1024})     # Token length
NEW_CONFIG.update({"qkv_bias": True})  # Bias for consistency

# Model using the modified config.
# "n_ctx", "dropout" and "qkv_bias" are not GPT2Config arguments: they are only
# stored as extra attributes and do not shape the model. The context length is
# controlled by "n_positions" and dropout by the three "*_pdrop" fields, so map
# them explicitly instead of relying on the library defaults happening to match.
# The extra keys are still passed through so the config object keeps exposing them.
new_config_obj = GPT2Config(
    **NEW_CONFIG,
    n_positions=NEW_CONFIG["n_ctx"],
    resid_pdrop=NEW_CONFIG["dropout"],
    embd_pdrop=NEW_CONFIG["dropout"],
    attn_pdrop=NEW_CONFIG["dropout"],
)
gpt = GPT2LMHeadModel(new_config_obj)
gpt.eval()

print("Modelo com configuração ajustada:", new_config_obj)

Modelo com configuração ajustada: GPT2Config {
  "_attn_implementation_autoset": true,
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dropout": 0.1,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "qkv_bias": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.48.3",
  "use_cache": true,
  "vocab_size": 50257
}



### Funções para transferir os pesos e gerar texto

In [6]:
import torch
import torch.nn.functional as F
import numpy as np
import tiktoken

# Shared tokenizer: the tiktoken encoding registered under the name "gpt2"
tokenizer = tiktoken.get_encoding("gpt2")

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batch of token ids.

    Args:
        text: The string to encode. ``<|endoftext|>`` is allowed as a special token.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a list of integer ids.

    Returns:
        An integer (``torch.long``) tensor of shape ``(1, num_tokens)``.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Explicit dtype: an empty id list would otherwise be inferred as float32,
    # which is not a valid index dtype for an embedding lookup.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of ids; a leading dimension of size 1 is dropped
            before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened id list.
    """
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)

# Return the right tensor as trainable
def assign(left, right):
    """Build a trainable float32 parameter from ``right`` after a shape check.

    Args:
        left: The tensor/parameter being replaced. Only its shape (and, when it
            has one, its device) is used.
        right: The new values, as a NumPy array or a tensor.

    Returns:
        A ``torch.nn.Parameter`` holding a float32 copy of ``right``, placed on
        ``left``'s device when ``left`` exposes one.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    # Keep the replacement on the device of the parameter it replaces, so
    # loading into a model that was already moved does not mix devices.
    device = getattr(left, "device", None)
    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a copy-construct warning; copy explicitly.
        values = right.detach().clone().to(device=device, dtype=torch.float32)
    else:
        values = torch.tensor(right, dtype=torch.float32, device=device)
    return torch.nn.Parameter(values)

# Load weights to our GPT model
def load_weights_into_gpt(gpt, params):
    """Copy GPT-2 checkpoint arrays from ``params`` into the model ``gpt``.

    Args:
        gpt: Model exposing ``transformer.wpe``, ``transformer.wte``,
            ``transformer.h`` (the blocks), ``transformer.ln_f`` and ``lm_head``.
        params: Dictionary with the keys ``"wpe"``, ``"wte"``, ``"g"``, ``"b"``
            and ``"blocks"``; each block holds ``"attn"``, ``"mlp"``, ``"ln_1"``
            and ``"ln_2"`` sub-dictionaries.

    Raises:
        ValueError: If the number of blocks differs between model and
            checkpoint, or if any array's shape does not match its target.
    """
    blocks = params["blocks"]
    layers = gpt.transformer.h
    # Refuse a partial load: with fewer checkpoint blocks the remaining layers
    # would silently keep their random initialisation.
    if len(blocks) != len(layers):
        raise ValueError(
            f"Layer count mismatch. Model: {len(layers)}, Checkpoint: {len(blocks)}"
        )

    # Positional and token embeddings
    gpt.transformer.wpe.weight = assign(gpt.transformer.wpe.weight, params["wpe"])
    gpt.transformer.wte.weight = assign(gpt.transformer.wte.weight, params["wte"])

    for layer, block in zip(layers, blocks):
        attn, mlp = block["attn"], block["mlp"]

        # The checkpoint stores query/key/value fused in one array, and so does
        # the model's c_attn, so the fused array is assigned as-is.
        layer.attn.c_attn.weight = assign(layer.attn.c_attn.weight, attn["c_attn"]["w"])
        layer.attn.c_attn.bias = assign(layer.attn.c_attn.bias, attn["c_attn"]["b"])
        layer.attn.c_proj.weight = assign(layer.attn.c_proj.weight, attn["c_proj"]["w"])
        layer.attn.c_proj.bias = assign(layer.attn.c_proj.bias, attn["c_proj"]["b"])

        # Feed-forward sub-layer
        layer.mlp.c_fc.weight = assign(layer.mlp.c_fc.weight, mlp["c_fc"]["w"])
        layer.mlp.c_fc.bias = assign(layer.mlp.c_fc.bias, mlp["c_fc"]["b"])
        layer.mlp.c_proj.weight = assign(layer.mlp.c_proj.weight, mlp["c_proj"]["w"])
        layer.mlp.c_proj.bias = assign(layer.mlp.c_proj.bias, mlp["c_proj"]["b"])

        # Layer norms: "g" is the scale (weight), "b" the shift (bias)
        layer.ln_1.weight = assign(layer.ln_1.weight, block["ln_1"]["g"])
        layer.ln_1.bias = assign(layer.ln_1.bias, block["ln_1"]["b"])
        layer.ln_2.weight = assign(layer.ln_2.weight, block["ln_2"]["g"])
        layer.ln_2.bias = assign(layer.ln_2.bias, block["ln_2"]["b"])

    # Final layer norm (ln_f)
    gpt.transformer.ln_f.weight = assign(gpt.transformer.ln_f.weight, params["g"])
    gpt.transformer.ln_f.bias = assign(gpt.transformer.ln_f.bias, params["b"])

    # Output head (tied with token embeddings): share the very same parameter
    # object instead of loading a second, independent copy of "wte".
    if gpt.lm_head.weight.shape != gpt.transformer.wte.weight.shape:
        raise ValueError(
            f"Shape mismatch. Left: {gpt.lm_head.weight.shape}, "
            f"Right: {gpt.transformer.wte.weight.shape}"
        )
    gpt.lm_head.weight = gpt.transformer.wte.weight


@torch.no_grad()
def generate(model, idx, max_new_tokens, context_size, top_k=50, temperature=1.0):
    """Autoregressively extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Module whose call returns an object with a ``.logits`` tensor of
            shape ``(batch, seq, vocab)``.
        idx: Integer tensor of prompt token ids, shape ``(batch, seq)``. It is
            copied and moved to the model's device; the input is not modified.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        top_k: Sample only among the ``top_k`` highest logits per row. ``None``
            or a value <= 0 disables the filter; values above the vocabulary
            size are clamped to it.
        temperature: Divisor applied to the logits before sampling. ``None`` or
            a value <= 0 selects the most likely token (greedy decoding)
            instead of dividing by zero.

    Returns:
        Tensor of shape ``(batch, seq + max_new_tokens)`` on the model's device.
    """
    # Move input tensor to the device the model lives on
    device = next(model.parameters()).device
    idx = idx.clone().detach().to(device)

    greedy = temperature is None or temperature <= 0

    for _ in range(max_new_tokens):
        # Crop to the model's context size and keep only the last position
        input_tensor = idx[:, -context_size:]
        logits = model(input_tensor).logits[:, -1, :]

        if greedy:
            next_token = torch.argmax(logits, dim=-1, keepdim=True)
        else:
            logits = logits / temperature

            # Keep top-k logits per row
            if top_k is not None and top_k > 0:
                k = min(top_k, logits.size(-1))
                # Slice with -1: to keep a (batch, 1) threshold; a (batch,)
                # vector would broadcast along the vocabulary axis instead.
                threshold = torch.topk(logits, k).values[:, -1:]
                logits = logits.masked_fill(logits < threshold, float("-inf"))

            # Convert logits to probabilities and sample the next token
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        idx = torch.cat([idx, next_token], dim=1)

    return idx


### Instanciar o modelo pegando camada do GPT-2

In [7]:
# Try to load the GPT 2 weights into our model
load_weights_into_gpt(gpt, params)
# Use the GPU when one is available; otherwise stay on the CPU so the cell
# still runs on machines without CUDA
gpt.to("cuda" if torch.cuda.is_available() else "cpu")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Gerar texto

In [17]:
# Generate some text just to test it
torch.manual_seed(123)  # fixed seed so the sampled continuation is repeatable
token_ids = generate(
    model=gpt,
    # generate() moves the prompt onto the model's own device, so no device is
    # hard-coded here and the cell also works without CUDA
    idx=text_to_token_ids("My NLP professor is", tokenizer),
    max_new_tokens=25,
    context_size=NEW_CONFIG["n_ctx"],
    top_k=50,
    temperature=1.5
)
print("Saída:\n", token_ids_to_text(token_ids, tokenizer))

Saída:
 My NLP professor is the dean's assistant, helping students with the student writing requirement, student engagement with their institution and learning as you move up.
