In [1]:
# Only needed when downloads fail because of SSL certificate problems.
import os
import certifi

# Both variables are written before any Hugging Face import happens in later cells.
os.environ.update({
    # CA bundle path, taken from certifi.
    'REQUESTS_CA_BUNDLE': certifi.where(),
    # Hugging Face home/cache directory; change this path to whichever you prefer.
    'HF_HOME': 'D:\\huggingface_cache',
})

In [2]:
from transformers import GPT2Tokenizer

# Check whether CUDA is available and report which device will be used,
# including the GPU name when there is one.
# `device` follows the integer convention used further down: 0 = first GPU, -1 = CPU.

import torch

if torch.cuda.is_available():
    device = 0
    print("Dispositivo utilizado:", "cuda")
    print("Nombre del dispositivo:", torch.cuda.get_device_name(0))
else:
    device = -1
    print("Dispositivo utilizado:", "cpu")

  from .autonotebook import tqdm as notebook_tqdm


Dispositivo utilizado: cuda
Nombre del dispositivo: NVIDIA T1200 Laptop GPU


In [None]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

example_sentence = "I love coding in python"
# Round trip: text -> subword tokens -> vocabulary ids -> tokens again.
tokens = tokenizer.tokenize(example_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
reconstructed_tokens = tokenizer.convert_ids_to_tokens(token_ids)

print("Tokens:", tokens)
print("Token IDs:", token_ids)
# Print the round-trip result as well; it was computed but never shown,
# although the saved cell output includes this line.
print("Reconstructed Tokens:", reconstructed_tokens)

Tokens: ['I', 'Ġlove', 'Ġcoding', 'Ġin', 'Ġpython']
Token IDs: [40, 1842, 19617, 287, 21015]
Reconstructed Tokens: ['I', 'Ġlove', 'Ġcoding', 'Ġin', 'Ġpython']


In [None]:
# Load the pretrained GPT-2 tokenizer (kept here so the cell can run on its own).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# "Othon" is not a single vocabulary entry, so the tokenizer splits it into
# several subword pieces instead of producing an unknown token.
oov_sentence = "Othon loves coding in python"
tokens = tokenizer.tokenize(oov_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
reconstructed_tokens = tokenizer.convert_ids_to_tokens(token_ids)

print("Tokens:", tokens)
print("Token IDs:", token_ids)
# Print the round-trip result as well; it was computed but never shown,
# although the saved cell output includes this line.
print("Reconstructed Tokens:", reconstructed_tokens)

Tokens: ['O', 'th', 'on', 'Ġloves', 'Ġcoding', 'Ġin', 'Ġpython']
Token IDs: [46, 400, 261, 10408, 19617, 287, 21015]
Reconstructed Tokens: ['O', 'th', 'on', 'Ġloves', 'Ġcoding', 'Ġin', 'Ġpython']


In [5]:
# Autoregressive text generation example with GPT-2
from transformers import set_seed, GPT2LMHeadModel, pipeline
from torch import tensor, numel
from bertviz import model_view

# Fix the random seed so the sampled generations in the following cells are repeatable.
set_seed(42)

In [6]:
# Text-generation pipeline backed by GPT-2, placed on the device chosen earlier
# (`device` is 0 for the first GPU, -1 for CPU).
generator = pipeline(task='text-generation', model='gpt2', device=device)

# Ask for three continuations of the same prompt, each at most 30 new tokens long.
outputs = generator(
    "The current amount of data available for training language models is",
    max_new_tokens=30, num_return_sequences=3, truncation=True,
)

# Print every continuation, numbered from 1 and followed by a separator line.
for number, output in enumerate(outputs, start=1):
    print(f"\nOutput {number}: {output['generated_text']}\n\n")
    print("----------------------------------------------------------------")

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(



Output 1: The current amount of data available for training language models is too small to be considered a sufficient sample size, and is therefore not subject to a large-scale study. The current results are in addition to the


----------------------------------------------------------------

Output 2: The current amount of data available for training language models is limited, so the model is not completely accurate. We are looking for a new approach, such as training with a single variable, and using a dataset


----------------------------------------------------------------

Output 3: The current amount of data available for training language models is limited by the number of training methods used. Therefore, we are currently working to increase the number of training methods for training language models by implementing training methods


----------------------------------------------------------------


In [7]:
# Download GPT-2 and load it manually (instead of through the pipeline) so it can be
# inspected and visualized. `output_attentions=True` is passed at load time,
# presumably so attention weights are available for visualization.
model = GPT2LMHeadModel.from_pretrained("gpt2", output_attentions=True)

# Place the model on the GPU when one was detected, otherwise on the CPU.
if device == 0:
    model.to("cuda")
else:
    model.to("cpu")

# Last expression of the cell: shows the module tree (same object that `.to()` returns).
model

The following generation flags are not valid and may be ignored: ['output_attentions']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Tensors must always live on the same device as the model.
if device == 0:
    model_device = "cuda"
else:
    model_device = "cpu"

# Tokenize the sentence and move the resulting ids to that device.
encoded_input = tokenizer("Othon loves to code.", return_tensors='pt')
input_ids = encoded_input["input_ids"].to(model_device)

# Position ids 0..seq_len-1, given a leading batch axis and expanded to the shape
# of `input_ids`; used to look up the position embeddings (wpe) for every token.
position_ids = torch.arange(
    input_ids.shape[1], device=input_ids.device
)[None, :].expand_as(input_ids)
model.transformer.wpe(position_ids).shape

torch.Size([1, 7, 768])

In [12]:
# Now look up the token embeddings (wte): one embedding vector per token id in `input_ids`
wte_encoded = model.transformer.wte(input_ids)
print(wte_encoded.shape)


torch.Size([1, 7, 768])


In [None]:
# The initial input is the sum of the token embeddings and the position embeddings
initial_input = model.transformer.wte(input_ids) + model.transformer.wpe(position_ids)
print(initial_input.shape)

torch.Size([1, 7, 768])


In [15]:
# Display the summed embedding tensor itself (bare last expression -> cell output)
initial_input

tensor([[[-0.0461, -0.2066,  0.1599,  ..., -0.1851,  0.1534,  0.0842],
         [-0.0048, -0.1885,  0.1429,  ...,  0.0197, -0.1129, -0.0914],
         [-0.2206, -0.0326,  0.2140,  ..., -0.2712, -0.1248, -0.1405],
         ...,
         [-0.0008, -0.1269,  0.1579,  ..., -0.0074,  0.1089,  0.0720],
         [-0.0841,  0.0145,  0.1413,  ..., -0.3213,  0.1117,  0.1050],
         [ 0.0493, -0.0318,  0.1479,  ..., -0.0710,  0.0533,  0.0938]]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [16]:
# Count every scalar weight in the model by summing the element count of each parameter tensor.
total_params = sum(param.numel() for param in model.parameters())
print(f"Número total de parámetros en el modelo: {total_params:,}")

Número total de parámetros en el modelo: 124,439,808
