In [1]:
from transformers import BertConfig, BertTokenizer, BertModel
import torch
import tqdm
import numpy as np
import matplotlib.pyplot as plt
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import sys
# Resolve the notebook's parent directory to an absolute path and make it
# importable, so that the `src` package used by later cells can be found.
# The membership check keeps the cell idempotent: re-running it never adds
# a duplicate entry to sys.path.
module_path = os.path.abspath(os.path.join('..'))
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [100]:
from src.analyze_bert_base import compute_basic_weight_stats, \
                                    analyze_bert_self_attn, \
                                    analyze_bert_ffn, \
                                    compute_ffn_weight_stats

In [81]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Single source of truth for the checkpoint name, so the tokenizer and the
# model weights are always loaded from the same pretrained checkpoint.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# BertModel is the bare encoder: the pretraining heads stored in the
# checkpoint (the `cls.*` weights) are not loaded, which is what the
# "weights ... were not used" message emitted by this cell refers to.
model = BertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### BERT-base Overview
[bert-base-uncased](https://huggingface.co/bert-base-uncased) has 110M parameters and was pretrained on BookCorpus and English Wikipedia on two tasks: masked language modeling (MLM) and next sentence prediction (NSP).
<br>
Its architecture is a multi-layer bidirectional Transformer with 12 layers, a hidden dimension of 768, and 12 attention heads. Each of the 12 layers is of the form:

#### Attention
|        Self-Attention (in=768, out=768)
    <br>
|        Dropout
    <br>
|        Self-Attention Output Feed-Forward (in=768, out=768)
    <br>
|        Layer Norm
    <br>
|        Dropout

#### Intermediate

|       Feed-Forward (in=768, out=3072)
    <br>
|       GELU Activation

#### Output

|       Feed-Forward (in=3072, out=768)
    <br>
|       Layer Norm
    <br>
|        Dropout
<br>
There is also a final "BertPooler" layer that consists of a Feed-Forward (in=768, out=768) and a Tanh() activation.
    


In [89]:
# Display the module tree of the first encoder layer (index 0) to confirm the
# per-layer structure.
model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [92]:
# Take the query projection of the first encoder layer's self-attention and
# convert its weight matrix to a NumPy array. `.detach()` is needed because
# `.numpy()` cannot be called on a tensor that requires grad.
first_layer_self_attn = model.encoder.layer[0].attention.self
query_weights = first_layer_self_attn.query.weight.detach().numpy()

# NOTE(review): the two returned frames presumably hold per-node statistics
# along the two axes of the weight matrix — confirm against
# compute_basic_weight_stats in src/analyze_bert_base.py.
outgoing_df, incoming_df = compute_basic_weight_stats(query_weights)

In [93]:
# Sanity-check the dimensions of the statistics frame computed in the previous
# cell. (`df1` was a stale name that no cell in this notebook defines, so it
# fails on a fresh kernel.)
outgoing_df.shape

(768, 4)

In [94]:
# Preview the first rows of outgoing_df.
outgoing_df.head()

Unnamed: 0,nonzero,positive,nz_avg,nz_abs_avg
0,768,361,-0.002436,0.033115
1,768,379,0.000741,0.036338
2,768,374,-0.000108,0.034288
3,768,414,0.002157,0.03383
4,768,383,-0.001291,0.031552


In [95]:
# NOTE(review): the meaning of the literal 5 is not visible from this notebook;
# presumably it is an encoder layer index, by analogy with
# analyze_bert_ffn(model, layer_num) — confirm against src/analyze_bert_base.py.
dead_df = analyze_bert_self_attn(model, 5)

In [96]:
# Display the table returned by analyze_bert_self_attn.
dead_df

Unnamed: 0,q_weights,k_weights,v_weights,ffn_weights
output,0,0,0,0
input,0,0,0,0


In [101]:
# Run the feed-forward analysis once per encoder layer. The layer count is
# taken from the loaded model rather than hard-coded, so the cell keeps working
# if a checkpoint with a different depth is loaded.
# NOTE(review): `nz_cnt_corr` prints as nan in the output; presumably the
# nonzero count is identical for every node (zero variance makes the
# correlation undefined) — confirm in analyze_bert_ffn.
for layer_num in range(len(model.encoder.layer)):
    print(f"Feed-Forward Node Input/Output Correlations for layer {layer_num}:")
    analyze_bert_ffn(model, layer_num)
    

Feed-Forward Node Input/Output Correlations for layer 0:
Hidden layer 1:
    nz_cnt_corr: nan 
    nz_pos_corr: 0.004
    nz_avg_corr: 0.039
    nz_abs_corr: 0.638
    
Hidden layer 2:
    nz_cnt_corr: nan 
    nz_pos_corr: 0.011
    nz_avg_corr: -0.134
    nz_abs_corr: 0.438
    
Feed-Forward Node Input/Output Correlations for layer 1:
Hidden layer 1:
    nz_cnt_corr: nan 
    nz_pos_corr: -0.065
    nz_avg_corr: -0.060
    nz_abs_corr: 0.800
    
Hidden layer 2:
    nz_cnt_corr: nan 
    nz_pos_corr: -0.052
    nz_avg_corr: -0.209
    nz_abs_corr: 0.565
    
Feed-Forward Node Input/Output Correlations for layer 2:
Hidden layer 1:
    nz_cnt_corr: nan 
    nz_pos_corr: -0.036
    nz_avg_corr: -0.049
    nz_abs_corr: 0.793
    
Hidden layer 2:
    nz_cnt_corr: nan 
    nz_pos_corr: 0.027
    nz_avg_corr: -0.209
    nz_abs_corr: 0.339
    
Feed-Forward Node Input/Output Correlations for layer 3:
Hidden layer 1:
    nz_cnt_corr: nan 
    nz_pos_corr: -0.034
    nz_avg_corr: -0.062
    nz