In [60]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
model = BertModel.from_pretrained("bert-base-uncased")

In [62]:
named_params=list(model.named_parameters())
print("Bert named param "+str(len(named_params)))

print("Embedding Layer")
for p in named_params[0:5]:
    print(p[0]+" "+str(tuple(p[1].size())))

print("First Encoder")
for p in named_params[5:21]:
    print(p[0]+" "+str(tuple(p[1].size())))

print("Output Layer")
for p in named_params[-2:]:
    print(p[0]+" "+str(tuple(p[1].size())))


Bert named param 199
Embedding Layer
embeddings.word_embeddings.weight (30522, 768)
embeddings.position_embeddings.weight (512, 768)
embeddings.token_type_embeddings.weight (2, 768)
embeddings.LayerNorm.weight (768,)
embeddings.LayerNorm.bias (768,)
First Encoder
encoder.layer.0.attention.self.query.weight (768, 768)
encoder.layer.0.attention.self.query.bias (768,)
encoder.layer.0.attention.self.key.weight (768, 768)
encoder.layer.0.attention.self.key.bias (768,)
encoder.layer.0.attention.self.value.weight (768, 768)
encoder.layer.0.attention.self.value.bias (768,)
encoder.layer.0.attention.output.dense.weight (768, 768)
encoder.layer.0.attention.output.dense.bias (768,)
encoder.layer.0.attention.output.LayerNorm.weight (768,)
encoder.layer.0.attention.output.LayerNorm.bias (768,)
encoder.layer.0.intermediate.dense.weight (3072, 768)
encoder.layer.0.intermediate.dense.bias (3072,)
encoder.layer.0.output.dense.weight (768, 3072)
encoder.layer.0.output.dense.bias (768,)
encoder.layer.0.o

In [63]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [64]:
tokenizer.encode('Sinan loves a beautiful day')

[101, 8254, 2319, 7459, 1037, 3376, 2154, 102]

In [65]:
response=model(torch.tensor(tokenizer.encode('Sinan loves a beautiful day')).unsqueeze(0))

In [66]:
response

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2327,  0.1515, -0.0448,  ..., -0.5192,  0.4195,  0.2948],
         [ 0.3051, -0.6614,  0.2500,  ..., -0.9809,  0.2551,  0.2400],
         [-0.3610, -0.8759,  0.4542,  ..., -1.1120,  0.1791,  0.0664],
         ...,
         [ 0.0689, -0.0364,  0.4940,  ..., -0.6558,  0.2227, -0.3868],
         [-0.2657, -0.4257,  0.0056,  ...,  0.1352,  0.3596, -0.4585],
         [ 0.6100,  0.0263, -0.2532,  ..., -0.0680, -0.3901, -0.3541]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8777, -0.4542, -0.6287,  0.7511,  0.3151, -0.0913,  0.9175,  0.3766,
         -0.3059, -1.0000, -0.0577,  0.7535,  0.9913,  0.2113,  0.9418, -0.5328,
         -0.0568, -0.5698,  0.4090, -0.6096,  0.7876,  0.9995,  0.3670,  0.2453,
          0.4620,  0.9465, -0.6802,  0.9342,  0.9614,  0.7060, -0.5755,  0.2076,
         -0.9910, -0.1697, -0.8019, -0.9952,  0.3786, -0.7309, -0.0599, -0.0186,
         -0.8722,  0.3377,  0.99

In [67]:
response.last_hidden_state

tensor([[[-0.2327,  0.1515, -0.0448,  ..., -0.5192,  0.4195,  0.2948],
         [ 0.3051, -0.6614,  0.2500,  ..., -0.9809,  0.2551,  0.2400],
         [-0.3610, -0.8759,  0.4542,  ..., -1.1120,  0.1791,  0.0664],
         ...,
         [ 0.0689, -0.0364,  0.4940,  ..., -0.6558,  0.2227, -0.3868],
         [-0.2657, -0.4257,  0.0056,  ...,  0.1352,  0.3596, -0.4585],
         [ 0.6100,  0.0263, -0.2532,  ..., -0.0680, -0.3901, -0.3541]]],
       grad_fn=<NativeLayerNormBackward0>)

In [68]:
response.pooler_output.shape

torch.Size([1, 768])

In [69]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [70]:
CLS_embedding=response.last_hidden_state[:,0,:].unsqueeze(0)
CLS_embedding.shape

torch.Size([1, 1, 768])

In [71]:
model.pooler(CLS_embedding).shape

torch.Size([1, 768])

In [72]:
(model.pooler(CLS_embedding)==response.pooler_output).all()

tensor(True)

In [73]:
total_param=0
for p in model.parameters():
    if len(p.shape)==2:
        total_param+=p.shape[0]*p.shape[1]
print("Total param : "+str(total_param))

Total param : 109360128
