In [1]:
import os
from dotenv import load_dotenv
from transformers import AutoModelForSequenceClassification, AutoTokenizer, logging
import torch
import numpy as np

In [2]:
# Load tokenizer & model
logging.set_verbosity_error()
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny-mnli")
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny")



In [5]:
# Load fine-tuned model weights
trained = torch.load('SST-2-BERT-tiny.bin', map_location=torch.device('cpu'))
model.load_state_dict(trained, strict=False)

  trained = torch.load('SST-2-BERT-tiny.bin', map_location=torch.device('cpu'))


_IncompatibleKeys(missing_keys=[], unexpected_keys=['bert.embeddings.position_ids'])

In [6]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [8]:
# Input data
text = "Nuovo Cinema Paradiso has been an incredible movie! A gem in the italian culture."
text = "[CLS] " + text + " [SEP]"

In [9]:
# Tokenize input data
#tokenized = tokenizer(text) # <- this would return the input_ids and attention mask as well
tokenized_text = tokenizer.tokenize(text) # <- tokenize the input data
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) # <- convert tokens into ids
tokens_tensor = torch.tensor([indexed_tokens]) # <- convert token ids into tensor

#print("tokenized: ", tokenized)
#print("tokenized_text: ", tokenized_text)
#print("indexed_tokens: ", indexed_tokens)
#print("tokens_tensor: ", tokens_tensor)

# Embedding calculation
x = model.bert.embeddings(tokens_tensor, torch.tensor([[1] * len(tokenized_text)])) # require token ids tensor and attention mask list

print("x: ", x)

x:  tensor([[[ 0.7311, -0.0275, -8.5770,  ...,  1.3273,  0.1579, -0.1398],
         [ 0.0530, -0.2743, -0.2208,  ..., -1.4017, -0.5093, -0.7404],
         [-2.7270, -1.2554, -0.5465,  ..., -1.0224, -0.0679, -0.2995],
         ...,
         [-0.2928, -0.8412, -0.2279,  ..., -0.8374, -1.1446,  1.5150],
         [ 0.3317,  0.4604, -0.2388,  ...,  0.6477, -0.1716, -2.4884],
         [-1.4311,  0.4480, -0.1306,  ...,  1.0596, -1.3526, -0.9108]]],
       grad_fn=<NativeLayerNormBackward0>)


In [11]:
# Save the embedding into text file
path = "./inputs/0"
for i in range(len(x[0])):
    if not (os.path.exists(path)):
        os.makedirs(path)
    np.savetxt("./inputs/0/input_{}.txt".format(i), x[0][i].detach(), delimiter=",")

In [17]:
# Layer 1 - Self Attention
# fetch the weights of Q, K, V
query_weight = model.bert.encoder.layer[0].attention.self.query.weight.clone().detach().double().transpose(0, 1)
key_weight = model.bert.encoder.layer[0].attention.self.key.weight.clone().detach().double().transpose(0, 1)
value_weight = model.bert.encoder.layer[0].attention.self.value.weight.clone().detach().double().transpose(0, 1)

# fetch the biases of Q, K, V
query_bias = model.bert.encoder.layer[0].attention.self.query.bias.clone().detach().double()
key_bias = model.bert.encoder.layer[0].attention.self.key.bias.clone().detach().double()
value_bias = model.bert.encoder.layer[0].attention.self.value.bias.clone().detach().double()

In [19]:
# convert input embeddings into double
input_tensor = x.double()

# calculate the new Q, K, V based on the new input embeddings
query = torch.matmul(input_tensor, query_weight) + query_bias
key = torch.matmul(input_tensor, key_weight) + key_bias
value = torch.matmul(input_tensor, value_weight) + value_bias

# reshape matrics for multi-head attention
query = query.reshape([1, input_tensor.size()[1], 2, 64]) # 2 -> no. of attention heads, 64 -> dimension of each head (hidden size/no. of attention heads)
key = key.reshape([1, input_tensor.size()[1], 2, 64])
value = value.reshape([1, input_tensor.size()[1], 2, 64])

# permute to adjust the dimensions of the tensors for the dot product operations
query = query.permute([0, 2, 1, 3])
key = key.permute([0, 2, 3, 1])

# (Q * K) / square root of dk
qk = torch.matmul(query, key)
qk = qk / 8 # 1 / square root of 64

# Softmax()
qk_softmaxed = torch.softmax(qk, -1)

# permute value
value = value.permute([0, 2, 1, 3])

# 
dot_product = torch.matmul(qk_softmaxed, value)
# permute & reshape for further layer operations
dot_product = dot_product.permute([0, 2, 1, 3])
dot_product = dot_product.reshape([1, input_tensor.size()[1], 128])

print("dot_product: ", dot_product)

dot_product:  tensor([[[-0.3108, -0.0247,  0.8014,  ..., -0.2286,  1.3143, -0.9430],
         [-0.6416, -0.5837,  0.7616,  ..., -0.7451,  0.7322, -0.8307],
         [ 0.1578,  0.1372,  1.0732,  ..., -0.7571,  1.0376,  0.0132],
         ...,
         [-0.5239,  0.3390,  0.1429,  ..., -0.2271,  0.8409, -0.3488],
         [-0.1250,  0.4574,  0.2978,  ..., -0.3089,  1.2559, -0.6545],
         [-0.2520,  0.2029,  0.5690,  ..., -0.1556,  1.0461, -0.7791]]],
       dtype=torch.float64, grad_fn=<UnsafeViewBackward0>)
