# Emotion Intent Classification

# Precompute the mean, var, vy, normbias for each LayerNorm

# Load Model

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the classifier are both pulled from the same Hub checkpoint.
checkpoint = "gokuls/BERT-tiny-emotion-intent"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Switch to inference mode, then cast every parameter to float64.
# `model.double()` is kept as the last expression so the cell still displays
# the module tree.
model.eval()
model.double()

  from .autonotebook import tqdm as notebook_tqdm


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

# Load dataset

In [2]:
# Load the training split of the emotion dataset; the LayerNorm statistics
# below are precomputed over this split.
from datasets import load_dataset

train_dataset = load_dataset("dair-ai/emotion", split="train")

In [9]:
# Batch tokenizer used with Dataset.map
def tokenize_function(example):
    """Tokenize the ``text`` field of *example* with the notebook's tokenizer.

    Truncation and padding are both disabled, so each encoded sample keeps
    its own length.
    """
    texts = example['text']
    return tokenizer(texts, truncation=False, padding=False)
# NOTE: despite its name, this holds the tokenized *training* split.
tokenized_valid_dataset = train_dataset.map(tokenize_function, batched=True)

# Longest tokenized sample in the dataset
max_length = max(map(len, tokenized_valid_dataset['input_ids']))
print(f"Max length: {max_length}")


Map: 100%|██████████| 16000/16000 [00:00<00:00, 48583.18 examples/s]

Max length: 87





# Process

In [13]:
import math
import numpy as np
from tqdm import tqdm

# Per-LayerNorm stores, named <kind>_distribution_<encoder layer>_<sub-layer>
# (sub-layer 0 = attention output LayerNorm, 1 = feed-forward output LayerNorm).
#   mean_*     : one np.array per sentence with the mean of every token vector
#   var_*      : one np.array per sentence with 1 / sqrt(variance) of every
#                token vector (an inverse std-dev, despite the name)
#   vy_*       : one [128, 128] tensor per token (see _layernorm_with_stats)
#   normbias_* : the LayerNorm bias, appended once per token
mean_distribution_0_0 = []
var_distribution_0_0 = []
vy_distribution_0_0 = []
normbias_distribution_0_0 = []

mean_distribution_0_1 = []
var_distribution_0_1 = []
vy_distribution_0_1 = []
normbias_distribution_0_1 = []

mean_distribution_1_0 = []
var_distribution_1_0 = []
vy_distribution_1_0 = []
normbias_distribution_1_0 = []

mean_distribution_1_1 = []
var_distribution_1_1 = []
vy_distribution_1_1 = []
normbias_distribution_1_1 = []


def _attention_sublayer(layer, hidden):
    """Return self-attention + output projection + residual, before LayerNorm."""
    context = layer.attention.self(hidden)[0].double()
    dense = layer.attention.output.dense
    weight = dense.weight.clone().detach().double().transpose(0, 1)
    bias = dense.bias.clone().detach().double()
    return torch.matmul(context, weight) + bias + hidden


def _feed_forward_sublayer(layer, hidden):
    """Return intermediate dense + GELU + output dense + residual, before LayerNorm."""
    inner = torch.matmul(hidden, layer.intermediate.dense.weight.transpose(0, 1).double()) + layer.intermediate.dense.bias
    activated = torch.nn.functional.gelu(inner)
    projected = torch.matmul(activated, layer.output.dense.weight.transpose(0, 1).double()) + layer.output.dense.bias
    return projected + hidden


def _layernorm_with_stats(hidden, layer_norm, vy_store, normbias_store, filled_columns, total_columns=128):
    """Normalise *hidden* token by token and record the statistics used.

    Args:
        hidden: tensor with a leading batch axis of size 1, i.e. [1, tokens, hidden].
        layer_norm: module whose ``weight`` and ``bias`` are applied.
        vy_store: list receiving, per token, ``weight / std`` replicated over
            ``filled_columns`` columns and zero-padded to ``total_columns``.
        normbias_store: list receiving the LayerNorm bias once per token.
        filled_columns: number of non-zero columns in each vy matrix.
        total_columns: total number of columns in each vy matrix.

    Returns:
        ``(normalised, means, inv_stds)`` where ``normalised`` has the shape of
        ``hidden`` and the other two are 1-D arrays with one entry per token.

    NOTE(review): ``torch.var`` defaults to the unbiased estimator and no eps
    is added, whereas ``nn.LayerNorm`` uses the biased estimator plus eps.
    This is kept exactly as in the original computation; confirm that the
    consumer of the exported files expects it.
    """
    # Weights are constant for the whole call, so fetch them once.
    gamma = layer_norm.weight.clone().detach().double().unsqueeze(0)  # [1, hidden]
    beta = layer_norm.bias.clone().detach().double()
    zero_columns = torch.zeros(gamma.shape[1], total_columns - filled_columns, dtype=gamma.dtype)

    means = []
    inv_stds = []
    normalised_rows = []
    # squeeze(0) drops only the batch axis, so a one-token input keeps its token axis.
    for token_vector in hidden.squeeze(0):
        mean = torch.mean(token_vector).item()
        inv_std = 1 / math.sqrt(torch.var(token_vector).item())
        means.append(mean)
        inv_stds.append(inv_std)

        vy = (gamma * inv_std).squeeze(0)
        replicated = vy.unsqueeze(1).repeat(1, filled_columns)  # [hidden, filled_columns]
        vy_store.append(torch.cat((replicated, zero_columns), dim=1))  # [hidden, total_columns]
        normbias_store.append(beta)

        normalised_rows.append((token_vector - mean) * inv_std * gamma + beta)  # [1, hidden]

    normalised = torch.cat(tuple(normalised_rows), 0).unsqueeze(0)
    return normalised, np.array(means), np.array(inv_stds)


# For each encoder layer: the layer, then the (mean, var, vy, normbias) stores
# of its attention-output LayerNorm and of its feed-forward-output LayerNorm.
encoder_stages = (
    (
        model.bert.encoder.layer[0],
        (mean_distribution_0_0, var_distribution_0_0, vy_distribution_0_0, normbias_distribution_0_0),
        (mean_distribution_0_1, var_distribution_0_1, vy_distribution_0_1, normbias_distribution_0_1),
    ),
    (
        model.bert.encoder.layer[1],
        (mean_distribution_1_0, var_distribution_1_0, vy_distribution_1_0, normbias_distribution_1_0),
        (mean_distribution_1_1, var_distribution_1_1, vy_distribution_1_1, normbias_distribution_1_1),
    ),
)

# Only forward values are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for sentence in tqdm(train_dataset['text']):
        text = "[CLS] " + sentence + " [SEP]"

        tokenized_text = tokenizer.tokenize(text)
        segments_ids = [1] * len(tokenized_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # Embeddings
        hidden = model.bert.embeddings(tokens_tensor, segments_tensors).double()

        for layer, attention_stores, output_stores in encoder_stages:
            # Self-attention sub-layer followed by its LayerNorm
            means, inv_stds, vys, biases = attention_stores
            hidden, token_means, token_inv_stds = _layernorm_with_stats(
                _attention_sublayer(layer, hidden),
                layer.attention.output.LayerNorm,
                vys,
                biases,
                max_length,
            )
            means.append(token_means)
            inv_stds.append(token_inv_stds)

            # Feed-forward sub-layer followed by its LayerNorm
            means, inv_stds, vys, biases = output_stores
            hidden, token_means, token_inv_stds = _layernorm_with_stats(
                _feed_forward_sublayer(layer, hidden),
                layer.output.LayerNorm,
                vys,
                biases,
                max_length,
            )
            means.append(token_means)
            inv_stds.append(token_inv_stds)

        # Final encoder output of the current sentence, kept under its original name.
        fin7_whole = hidden

100%|██████████| 16000/16000 [02:29<00:00, 107.33it/s]


In [14]:
import numpy as np

# Each list ends up holding a single array, which np.savetxt later writes as one row.
precomputed_mean_0_0 = []
precomputed_var_0_0 = []
precomputed_mean_0_1 = []
precomputed_var_0_1 = []
precomputed_mean_1_0 = []
precomputed_var_1_0 = []
precomputed_mean_1_1 = []
precomputed_var_1_1 = []

precomputed_normbias_0_0 = []
precomputed_normbias_0_1 = []
precomputed_normbias_1_0 = []
precomputed_normbias_1_1 = []


def _positionwise_totals(distribution, width):
    """Sum per-sentence arrays position by position.

    Returns ``(totals, counts)``, both of length *width*: ``totals[i]`` is the
    sum of entry ``i`` over every sample that has one, ``counts[i]`` is how
    many samples contributed. Entries at index >= width are ignored.
    """
    totals = np.zeros(width)
    counts = np.zeros(width)
    for sample in distribution:
        used = min(len(sample), width)
        totals[:used] += sample[:used]
        counts[:used] += 1
    return totals, counts


# Number of token positions in every exported vector.
num_positions = 128

# (mean samples, var samples, normbias samples, mean out, var out, normbias out)
# for each LayerNorm, in forward order.
layernorm_stages = (
    (mean_distribution_0_0, var_distribution_0_0, normbias_distribution_0_0,
     precomputed_mean_0_0, precomputed_var_0_0, precomputed_normbias_0_0),
    (mean_distribution_0_1, var_distribution_0_1, normbias_distribution_0_1,
     precomputed_mean_0_1, precomputed_var_0_1, precomputed_normbias_0_1),
    (mean_distribution_1_0, var_distribution_1_0, normbias_distribution_1_0,
     precomputed_mean_1_0, precomputed_var_1_0, precomputed_normbias_1_0),
    (mean_distribution_1_1, var_distribution_1_1, normbias_distribution_1_1,
     precomputed_mean_1_1, precomputed_var_1_1, precomputed_normbias_1_1),
)

for layer, (mean_samples, var_samples, normbias_samples,
            mean_out, var_out, normbias_out) in enumerate(layernorm_stages):
    print(f"Processing layer {layer}")

    # The sample counts come from the mean samples and are reused for the var
    # samples, which are recorded alongside them.
    total_means, counts = _positionwise_totals(mean_samples, num_positions)
    total_vars, _ = _positionwise_totals(var_samples, num_positions)

    # Positions that no sentence reaches keep an average of 0.
    average_means = np.zeros(num_positions)
    average_vars = np.zeros(num_positions)
    seen = counts > 0
    average_means[seen] = total_means[seen] / counts[seen]
    average_vars[seen] = total_vars[seen] / counts[seen]

    # The bias is averaged over every recorded token, not per position.
    average_normbias = np.mean(normbias_samples, axis=0)

    mean_out.append(average_means)
    var_out.append(average_vars)
    normbias_out.append(average_normbias)

Processing layer 0
Processing layer 1
Processing layer 2
Processing layer 3


In [15]:
def compute_vy_mean(vy_distribution):
    """Return the element-wise average of a list of equally-shaped tensors.

    The tensors are stacked along a new leading axis, averaged over that
    axis, and the result is handed back as a NumPy array.
    """
    per_input = torch.stack(vy_distribution)
    return per_input.mean(dim=0).cpu().numpy()

# Average the recorded vy matrices of each LayerNorm, in forward order
precomputed_vy_0_0, precomputed_vy_0_1, precomputed_vy_1_0, precomputed_vy_1_1 = map(
    compute_vy_mean,
    (vy_distribution_0_0, vy_distribution_0_1, vy_distribution_1_0, vy_distribution_1_1),
)

# Save precomputed weights

In [22]:
import os

# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step.
output_dir = "./emotion-precompute"
os.makedirs(output_dir, exist_ok=True)

# File-name prefix of each LayerNorm, in forward order.
layer_tags = ("layer0_selfoutput", "layer0_output", "layer1_selfoutput", "layer1_output")

# (file suffix, values per LayerNorm in layer_tags order, np.savetxt delimiter).
# mean/var/normbias are written with a newline delimiter, vy with commas.
exports = (
    ("mean", (precomputed_mean_0_0, precomputed_mean_0_1, precomputed_mean_1_0, precomputed_mean_1_1), '\n'),
    ("var", (precomputed_var_0_0, precomputed_var_0_1, precomputed_var_1_0, precomputed_var_1_1), '\n'),
    ("vy", (precomputed_vy_0_0, precomputed_vy_0_1, precomputed_vy_1_0, precomputed_vy_1_1), ','),
    ("normbias", (precomputed_normbias_0_0, precomputed_normbias_0_1, precomputed_normbias_1_0, precomputed_normbias_1_1), '\n'),
)

for suffix, per_layer_values, delimiter in exports:
    for tag, values in zip(layer_tags, per_layer_values):
        np.savetxt(f"{output_dir}/{tag}_{suffix}.txt", values, delimiter=delimiter)

# Validation

In [None]:
# Relative agreement between a reference array and an approximation
def precision(correct, approx):
    """Return ``1 - (mean absolute error / mean absolute reference value)``.

    *approx* is converted to an array when it is given as a list. Both means
    are taken over ``len(correct)``, i.e. the length of the first axis.
    """
    approx = np.array(approx) if isinstance(approx, list) else approx
    n = len(correct)
    mean_abs_error = np.sum(np.abs(correct - approx)) / n
    mean_abs_reference = np.sum(np.abs(correct)) / n
    return 1 - mean_abs_error / mean_abs_reference

# NOTE(review): the reference files are read from ./weights-sst2 while this
# notebook writes its own results to ./emotion-precompute; confirm that this
# is the intended reference set.

# Reference means, one file per LayerNorm in forward order
real_mean_0_0, real_mean_0_1, real_mean_1_0, real_mean_1_1 = (
    np.loadtxt(path)
    for path in (
        "./weights-sst2/layer0_selfoutput_mean.txt",
        "./weights-sst2/layer0_output_mean.txt",
        "./weights-sst2/layer1_selfoutput_mean.txt",
        "./weights-sst2/layer1_output_mean.txt",
    )
)

# Reference vy matrices (comma-separated)
real_vy_0_0, real_vy_0_1, real_vy_1_0, real_vy_1_1 = (
    np.loadtxt(path, delimiter=',')
    for path in (
        "./weights-sst2/layer0_selfoutput_vy.txt",
        "./weights-sst2/layer0_output_vy.txt",
        "./weights-sst2/layer1_selfoutput_vy.txt",
        "./weights-sst2/layer1_output_vy.txt",
    )
)

# Reference LayerNorm biases
real_normbias_0_0, real_normbias_0_1, real_normbias_1_0, real_normbias_1_1 = (
    np.loadtxt(path, delimiter=',')
    for path in (
        "./weights-sst2/layer0_selfoutput_normbias.txt",
        "./weights-sst2/layer0_output_normbias.txt",
        "./weights-sst2/layer1_selfoutput_normbias.txt",
        "./weights-sst2/layer1_output_normbias.txt",
    )
)

In [None]:
# Score every precomputed statistic against its reference, in forward order
mean1, mean2, mean3, mean4 = (
    precision(reference, estimate)
    for reference, estimate in (
        (real_mean_0_0, precomputed_mean_0_0),
        (real_mean_0_1, precomputed_mean_0_1),
        (real_mean_1_0, precomputed_mean_1_0),
        (real_mean_1_1, precomputed_mean_1_1),
    )
)

vy1, vy2, vy3, vy4 = (
    precision(reference, estimate)
    for reference, estimate in (
        (real_vy_0_0, precomputed_vy_0_0),
        (real_vy_0_1, precomputed_vy_0_1),
        (real_vy_1_0, precomputed_vy_1_0),
        (real_vy_1_1, precomputed_vy_1_1),
    )
)

normbias1, normbias2, normbias3, normbias4 = (
    precision(reference, estimate)
    for reference, estimate in (
        (real_normbias_0_0, precomputed_normbias_0_0),
        (real_normbias_0_1, precomputed_normbias_0_1),
        (real_normbias_1_0, precomputed_normbias_1_0),
        (real_normbias_1_1, precomputed_normbias_1_1),
    )
)

# Report: means first, then vy, then normbias
print(f"Precision of mean_0_0: {mean1}")
print(f"Precision of mean_0_1: {mean2}")
print(f"Precision of mean_1_0: {mean3}")
print(f"Precision of mean_1_1: {mean4}")

print(f"Precision of vy_0_0: {vy1}")
print(f"Precision of vy_0_1: {vy2}")
print(f"Precision of vy_1_0: {vy3}")
print(f"Precision of vy_1_1: {vy4}")

print(f"Precision of normbias_0_0: {normbias1}")
print(f"Precision of normbias_0_1: {normbias2}")
print(f"Precision of normbias_1_0: {normbias3}")
print(f"Precision of normbias_1_1: {normbias4}")