# Ejercicio #1

In [1]:
import os
import certifi
os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()


In [2]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


PanicException: called `Result::unwrap()` on an `Err` value: "Python version string has too many parts"

In [2]:
# Vamos a cargar una version preentrenada de BERT
model = BertModel.from_pretrained('bert-base-uncased')

# Mover el modelo a GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [3]:
named_params = list(model.named_parameters())

# Imprimir la cantidad de parámetros y sus nombres
print(f"Total de parámetros: {len(named_params)}")

Total de parámetros: 199


In [4]:
# Imprimir la capa de embedding
print("======Capa de Embedding======\n")
for p in named_params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].shape))))

# Imprimir el primer encoder
print("\n======Primer Encoder======\n")
for p in named_params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].shape))))

# Imprimir el segundo encoder
print("\n======Segundo Encoder======\n")
for p in named_params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].shape))))


embeddings.word_embeddings.weight                       (30522, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)


encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   (768,)
encoder.layer.0.attention.output.LayerNorm.weight             (768,)
encoder.layer.0.attention.outpu

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
tokenizer.encode("Othon loves to code in Python", return_tensors='pt').to(device)

tensor([[  101, 27178,  8747,  7459,  2000,  3642,  1999, 18750,   102]],
       device='cuda:0')

In [7]:
# Pasamos los tokens a través del modelo
input_ids = tokenizer.encode("Othon loves to code in Python", return_tensors='pt')
input_ids = input_ids.to(device)
with torch.no_grad():
    outputs = model(input_ids)

# Extraemos las representaciones de la última capa
last_hidden_states = outputs.last_hidden_state

print(last_hidden_states)

tensor([[[-0.2213,  0.4771, -0.3867,  ..., -0.1356,  0.6587,  0.4269],
         [ 0.8593,  0.1995,  0.5775,  ...,  0.1166,  0.9928,  0.0974],
         [-0.0088,  0.0421, -0.0123,  ...,  0.1461,  0.5752, -0.0631],
         ...,
         [-0.8762,  0.4043, -0.4904,  ..., -0.6379, -0.3283,  0.4589],
         [-0.1807,  0.6547, -0.7054,  ..., -0.2865,  0.2484, -0.2530],
         [ 0.6761,  0.0906, -0.2353,  ...,  0.1048, -0.7769, -0.2543]]],
       device='cuda:0')


In [8]:
last_hidden_states.shape

torch.Size([1, 9, 768])

In [25]:
# Esta capa en particular esta entrenada para generar una representación
# de la secuencia completa
outputs.pooler_output.shape

torch.Size([1, 768])

In [22]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [26]:
# Tomaremos la representación final del token [CLS] para calcular la similitud
cls_embedding = outputs.last_hidden_state[:, 0, :].unsqueeze(0)

cls_embedding.shape

torch.Size([1, 1, 768])

In [27]:
model.pooler(cls_embedding).shape

torch.Size([1, 768])

In [28]:
# Si corremos el embedding de CLS a través de la capa pooler obtenemos
# la misma representación que si lo hacemos directamente

(model.pooler(cls_embedding) == outputs.pooler_output).all()

tensor(True, device='cuda:0')

In [9]:
# De aquí podemos sacar tambien la cantidad total de parámetros del modelo
total_params = sum(p.numel() for p in model.parameters())
print(f"Total de parámetros del modelo: {total_params:,}")

Total de parámetros del modelo: 109,482,240
