## Imports 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import numpy as np
import json 
import os 
import sys 
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification

In [3]:
# Work from the thesis project root so project-relative imports and paths resolve
# (presumably this is where `utils/` and `jobs/` live — TODO confirm).
# NOTE(review): both paths are absolute and user-specific; the notebook will not
# run on another machine without editing them.
os.chdir('/home/tromanski/thesis/')
# Parent directory of the `XAI_Transformers_` package imported in the next cell.
sys.path.append('/home/tromanski/')

In [4]:
from XAI_Transformers_.SST.sst import get_sst_dataset
from XAI_Transformers_.xai_transformer import BertAttention
from XAI_Transformers_.attribution import _compute_rollout_attention

## Setup 

In [5]:
# SST-2 fine-tuned BERT classifier from the Hugging Face hub. Its embedding layer
# (`bert_model.bert.embeddings`) is reused further down as the pretrained embeddings.
bert_model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2", 
    use_safetensors=True)

loading configuration file config.json from cache at /home/tromanski/.cache/huggingface/hub/models--textattack--bert-base-uncased-SST-2/snapshots/95f0f6f859b35c8ff0863ae3cd4e2dbc702c0ae2/config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "sst-2",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Attempting to create safetensors variant
Safetensors PR exists


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /home/tromanski/.cache/huggingface/hub/models--textattack--bert-base-uncased-SST-2/snapshots/205ffbd1bc5c5b89802266f4948a601f53556b00/model.safetensors
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at textattack/bert-base-uncased-SST-2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# Models and input tensors created below are moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

In [7]:
# Tokenizer that matches the fine-tuned checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")

# Load data
# NOTE(review): the name `datasets` is the same as the Hugging Face package name;
# only `load_dataset` was imported from it, so nothing is shadowed here.
datasets  = load_dataset("glue", 'sst2')


# Identity mapping {0: 0, 1: 1}; not referenced again in the cells shown in this notebook.
label_to_id = {v: i for i, v in enumerate([0,1])}

# Only the second return value is kept. It is indexed (`test_data_loader[3]`) and
# iterated (872 items according to the progress bar) by the cells below, and each
# item is read through the keys 'input_ids', 'attention_mask', 'token_type_ids', 'label'.
_, test_data_loader = get_sst_dataset(datasets, tokenizer)

loading configuration file config.json from cache at /home/tromanski/.cache/huggingface/hub/models--textattack--bert-base-uncased-SST-2/snapshots/95f0f6f859b35c8ff0863ae3cd4e2dbc702c0ae2/config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "sst-2",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/tromanski/.cache/huggingface/hub/models--textattack--bert-base-uncased-SST-2/snapshots/95f0f6f859b35c8ff0863ae3cd4e2dbc702c0ae2/

## Using the original (OG) XAI Transformers implementation

In [8]:
from safetensors.torch import load_file

# Shared state for the attribution cells below: one sample (x) goes in,
# attributions come out.
models = {}
gammas = [0.00, 0.00, 0.00]

# Freeze the pretrained embedding layer.
# The previous code was a no-op: assigning `module.requires_grad = False` only
# sets a plain Python attribute on the Module, and the follow-up loop compared
# parameter names against the prefix 'embeddings' although the names yielded by
# `bert_model.named_parameters()` start with 'bert.embeddings.'.
# `requires_grad_(False)` switches off gradients on every parameter of the layer.
bert_model.bert.embeddings.requires_grad_(False)

pretrained_embeds = bert_model.bert.embeddings

# Checkpoint for the 3-layer attention model; `device` is already a torch.device,
# so it can be handed to `map_location` directly.
params = torch.load('/home/tromanski/XAI_Transformers_/SST/sst2-3layer-model.pt', map_location=device)

def rename_params(key):
    """Prefix every 'key', 'query' and 'value' substring of `key` with 'p'.

    The three substitutions are applied one after another in that order,
    e.g. 'attention.query.weight' -> 'attention.pquery.weight'.
    """
    return (
        key.replace('key', 'pkey')
           .replace('query', 'pquery')
           .replace('value', 'pvalue')
    )

# this is the standard way of their config for best performing method 
class Config:
    """Hyper-parameters and explanation switches passed to `BertAttention`.

    All values are fixed in `__init__`; callers adjust individual switches on
    the instance afterwards. `device` is read from the notebook-level `device`
    variable at construction time.
    """

    def __init__(self):
        # Model dimensions.
        self.hidden_size = 768
        self.num_attention_heads = 12
        self.layer_norm_eps = 1e-12
        self.n_classes = 2
        self.n_blocks = 3

        # Width of one attention head and of all heads together
        # (768 is divisible by 12, so the integer division is exact).
        self.attention_head_size = self.hidden_size // self.num_attention_heads
        self.all_head_size = self.attention_head_size * self.num_attention_heads

        # Explanation switches.
        self.detach_layernorm = True  # detaches the attention-block-output LayerNorm
        self.detach_kq = True  # detaches the kq-softmax branch
        self.device = device
        self.train_mode = False
        self.detach_mean = True

# For GAE none of the branches is detached, so every switch is turned off.
config = Config()
config.detach_layernorm = False  # keep the attention-block-output LayerNorm attached
config.detach_mean = False  # presumably the mean inside the LayerNorm — TODO confirm
config.detach_kq = False  # keep the kq-softmax branch attached
model = BertAttention(config, pretrained_embeds)
# params = load_file('/home/tromanski/thesis/results/custom-bert/checkpoint-2105/model.safetensors', device=str(device))

# strict=False skips checkpoint entries that do not match the model and leaves
# unmatched model parameters at their current values. Report both lists so a
# naming mismatch cannot go unnoticed.
load_report = model.load_state_dict(params, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print('load_state_dict: missing keys (not found in checkpoint):', list(load_report.missing_keys))
    print('load_state_dict: unexpected keys (ignored from checkpoint):', list(load_report.unexpected_keys))

model.to(device)
models['none'] = model



### Run 

In [27]:
# Attribute one SST-2 validation sample with GAE (attention x attention-gradient,
# followed by attention rollout) and record the result in `results`.
#
# Two implementations of the same computation are kept side by side so their
# outputs can be compared:
#   og_method = True   -> reference path that round-trips through NumPy
#   og_method = False  -> the same steps using torch ops only (graph is kept)

# Use `og_method` if another cell already set it; otherwise default to the torch
# path. Previously this cell raised NameError on a fresh kernel because the flag
# was only defined in a later cell.
og_method = globals().get('og_method', False)

# When True, the torch path stacks the rollout with a copy shifted by +0.002 to
# imitate a batch of two samples (a shape check for batched inputs). The second
# row of `attribution` is then synthetic and must not be treated as a real sample.
simulate_batch = True

# Index of the sample to explain. The earlier version wrapped the code below in
# `for y in tqdm(test_data_loader): ... break` but never used `y`: it always
# processed this one sample exactly once.
SAMPLE_INDEX = 3


def _as_long_batch(values):
    """Convert one field of a sample to an int64 tensor with a leading batch axis on `device`."""
    return torch.as_tensor(np.asarray(values), dtype=torch.long).unsqueeze(0).to(device)


x = test_data_loader[SAMPLE_INDEX]
results = {'losses': [], 'attributions': []}

input_ids = _as_long_batch(x['input_ids'])
# attention_mask / token_type_ids / words are not passed to the model in this
# cell; they are kept for interactive inspection.
attention_mask = _as_long_batch(x['attention_mask'])
token_type_ids = _as_long_batch(x['token_type_ids'])
words = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
y_true = torch.tensor(x['label']).to(device)
labels_in = torch.tensor([int(y_true)] * len(input_ids)).long().to(device)

# Plain forward pass: loss and predicted class for the sample.
outs = model(input_ids=input_ids, labels=labels_in)
loss = outs['loss'].detach().cpu().numpy()
y_pred = np.argmax(outs['logits'].squeeze().detach().cpu().numpy())

gammas = [0.00, 0.00, 0.00]
# Explanation pass. The code below reads `model.attention_probs` and
# `model.attention_gradients` afterwards (presumably filled in by this call).
outs = model.forward_and_explain_legacy(input_ids=input_ids, cl=y_true,
                                        labels=labels_in, method='GAE')

if og_method:
    # Reference path: detach everything and work on squeezed NumPy arrays.
    attns = [model.attention_probs[k].detach().cpu().numpy() for k in sorted(model.attention_probs.keys())]
    attentions_mat = np.stack(attns, axis=0).squeeze()
    print('attentions_mat shape:', attentions_mat.shape)
    attns = [model.attention_gradients[k].detach().cpu().numpy() for k in sorted(model.attention_gradients.keys())]
    attentions_grads = np.stack(attns, axis=0).squeeze()
    print('attentions_grads shape:', attentions_grads.shape)
    # Keep only positive attention x gradient contributions.
    attentions_mat = torch.tensor(attentions_mat * attentions_grads).clamp(min=0)
    print('attentions_mat after grad mul shape:', attentions_mat.shape)
    # Average over dim 1 (heads, once the batch axis has been squeezed away).
    # The tensor is already clamped, so it is neither re-wrapped in torch.tensor
    # (which warns on tensor input) nor clamped a second time.
    attentions_mat = attentions_mat.mean(dim=1)
    print('attentions_mat after mean over heads shape:', attentions_mat.shape)
    joint_attentions = _compute_rollout_attention(attentions_mat)
    print('joint_attentions shape:', joint_attentions.shape)
    joint_attentions[:, 0, 0] = 0
    idx = 0
    attribution = joint_attentions[idx].sum(0)
    print('attribution shape:', attribution.shape)
else:
    # Torch-only path. Shape legend: L layers, B batch size, H heads, T tokens.
    attns = [model.attention_probs[k] for k in sorted(model.attention_probs.keys())]
    attentions_mat = torch.stack(attns, dim=0)  # expected (L, B, H, T, T)
    bs = attentions_mat.shape[1]
    print('attentions_mat shape:', attentions_mat.shape)

    grads = [model.attention_gradients[k] for k in sorted(model.attention_gradients.keys())]
    attentions_grads = torch.stack(grads, dim=0)  # expected (L, B, H, T, T)
    if len(attentions_grads.shape) != 5 and bs == 1:
        # Restore the batch axis if the stored gradients came without it.
        attentions_grads = attentions_grads.unsqueeze(1)
    print('attentions_grads shape:', attentions_grads.shape)

    # Positive part of attention x gradient, averaged over heads -> (L, B, T, T).
    attentions_mat = (attentions_mat * attentions_grads).clamp(min=0).mean(dim=2)
    print('attentions_mat after mean over heads shape:', attentions_mat.shape)

    # The rollout is fed a list of L per-layer matrices of shape (B, T, T).
    attentions_mat = [attentions_mat[l] for l in range(attentions_mat.size(0))]
    joint_attentions = _compute_rollout_attention(attentions_mat)
    print('joint_attentions shape:', joint_attentions.shape)

    # Clone before the in-place write so the rollout result itself is left untouched.
    joint_attentions = joint_attentions.clone()
    joint_attentions[:, 0, 0] = 0
    print('joint_attentions after zeroing CLS-to-CLS shape:', joint_attentions.shape)

    if simulate_batch:
        joint_attentions = torch.stack([joint_attentions, joint_attentions + 0.002], dim=0).squeeze()
        print('joint_attentions after simulated batch shape:', joint_attentions.shape)

    # Sum over dim 1 of the (B, T, T) rollout, giving one score per token and sample.
    # NOTE(review): the author was unsure whether this selects the CLS row in the
    # same way as the reference path (`joint_attentions[0].sum(0)`) — confirm.
    attribution = joint_attentions.sum(1, keepdim=False)
    print('attribution shape:', attribution.shape)

results['attributions'].append(attribution.tolist())
results['losses'].append(loss.tolist())
print(attribution)

  0%|          | 0/872 [00:00<?, ?it/s]

Saving attention gradients
Saving attention gradients
Saving attention gradients
attentions_mat shape: torch.Size([3, 1, 12, 28, 28])
attentions_grads shape: torch.Size([3, 1, 12, 28, 28])
attentions_mat after mean over heads shape: torch.Size([3, 1, 28, 28])
joint_attentions shape: torch.Size([1, 28, 28])
joint_attentions after zeroing CLS-to-CLS shape: torch.Size([1, 28, 28])
joint_attentions after zeroing CLS-to-CLS shape: torch.Size([2, 28, 28])
attribution shape: torch.Size([2, 28])
tensor([[0.0083, 1.0023, 1.0081, 1.0121, 1.0034, 1.0085, 1.0153, 0.9970, 1.0164,
         1.0129, 1.0107, 1.0105, 0.9967, 0.9881, 1.0724, 1.0293, 1.0093, 0.9989,
         1.0108, 1.0072, 1.0074, 1.0221, 0.9954, 0.9950, 0.9916, 1.0036, 1.0057,
         1.0014],
        [0.0643, 1.0583, 1.0641, 1.0681, 1.0594, 1.0645, 1.0713, 1.0530, 1.0724,
         1.0689, 1.0667, 1.0665, 1.0527, 1.0441, 1.1284, 1.0853, 1.0653, 1.0549,
         1.0668, 1.0632, 1.0634, 1.0781, 1.0514, 1.0510, 1.0476, 1.0596, 1.0617,
   




new method with list: 
attentions_mat shape: torch.Size([3, 12, 12, 12])
attentions_grads shape: torch.Size([3, 12, 12, 12])
attentions_mat after mean over heads shape: torch.Size([3, 12, 12])
joint_attentions shape: torch.Size([12, 12, 12])
joint_attentions after zeroing CLS-to-CLS shape: torch.Size([12, 12, 12])
attribution shape: torch.Size([12])

old method: 
attentions_mat shape: (3, 12, 12, 12)
attentions_grads shape: (3, 12, 12, 12)
attentions_mat after grad mul shape: torch.Size([3, 12, 12, 12])
attentions_mat after mean over heads shape: torch.Size([3, 12, 12])
joint_attentions shape: torch.Size([12, 12, 12]) # caused by a faulty expansion: the rollout expects the batch size in dim 2
-- both methods produce the same output

In [18]:
# Toggle read by the "Run" cell: True selects the NumPy reference path,
# False selects the torch-only path.
og_method = False

In [59]:
joint_attentions.shape

torch.Size([12, 12, 12])

In [None]:
# with open('/home/tromanski/thesis/data/GAE_attr/OG_val_bert_sst.json', 'w') as f:
#     json.dump(results, f)

In [14]:
attribution # for fourth sample which is 28 tokens 

tensor([0.0083, 1.0023, 1.0081, 1.0121, 1.0034, 1.0085, 1.0153, 0.9970, 1.0164,
        1.0129, 1.0107, 1.0105, 0.9967, 0.9881, 1.0724, 1.0293, 1.0093, 0.9989,
        1.0108, 1.0072, 1.0074, 1.0221, 0.9954, 0.9950, 0.9916, 1.0036, 1.0057,
        1.0014], grad_fn=<SumBackward1>)

In [38]:
attribution_og

tensor([0.0094, 1.0098, 1.0037, 1.0117, 1.0120, 1.0743, 1.0155, 1.0359, 1.0290,
        1.0241, 1.0095, 1.0017])

In [27]:
attribution

tensor([0.0094, 1.0098, 1.0037, 1.0117, 1.0120, 1.0743, 1.0155, 1.0359, 1.0290,
        1.0241, 1.0095, 1.0017], grad_fn=<SumBackward1>)

In [25]:
attribution

tensor([0.0094, 1.0098, 1.0037, 1.0117, 1.0120, 1.0743, 1.0155, 1.0359, 1.0290,
        1.0241, 1.0095, 1.0017], grad_fn=<SumBackward1>)

## Using my impl

In [None]:
print()

: 

In [26]:
# Same single-sample GAE attribution as in the "OG" section, but with the model
# returned by the project's own `load_custom_bert`; the attribution is taken
# directly from `outs['R']`.
# NOTE(review): this cell rebinds the notebook globals `device`, `model`, `x` and
# `results`, so cells that read them afterwards see these values.
from utils.generic import load_custom_bert


print('1')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_custom_bert(device=device, finetuned=True, explain=False, train=True)
model.train(False) # switch off training mode; presumably needed so dropout is disabled — TODO confirm
print('2')
# NOTE(review): this assignment is overwritten by the loop variable below.
x = test_data_loader[0]
results = {'losses':[], 'attributions':[]}
# model.explain()
import tqdm
# Only the first item is processed: the loop body ends with an unconditional `break`.
for x in tqdm.tqdm(test_data_loader):
    # Each field goes through float32, gets a leading batch axis and is cast to int64.
    # NOTE(review): `requires_grad=True` is set on the float tensor before the `.long()` cast.
    input_ids = torch.tensor(np.float32(x['input_ids']) , requires_grad=True).unsqueeze(0).long().to(device)
    attention_mask = torch.tensor(np.float32(x['attention_mask']), requires_grad=True).unsqueeze(0).long().to(device)
    token_type_ids = torch.tensor(np.float32(x['token_type_ids'])).unsqueeze(0).long().to(device)
    words = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
    y_true = torch.tensor(x['label']).to(device)
    labels_in = torch.tensor([int(y_true)]*len(input_ids)).long().to(device)
    # Plain forward pass for loss and prediction (attention_mask / token_type_ids are not passed).
    outs = model(input_ids=input_ids,
                                labels = labels_in)
    loss = outs['loss'].detach().cpu().numpy()
    y_pred = np.argmax(outs['logits'].squeeze().detach().cpu().numpy())

    gammas = [0.00,0.00, 0.00]
    # Explanation pass; the result dict is read through the keys 'R' and 'loss' below.
    outs = model.forward_and_explain(input_ids=input_ids, cl=y_true,
                    labels = labels_in , method='GAE')

    # The manual attention x gradient rollout used in the "OG" section is not
    # repeated here; `outs['R']` is stored instead (presumably the same quantity
    # computed inside `forward_and_explain` — TODO confirm).
    results['attributions'].append(outs['R'].tolist())
    results['losses'].append(outs['loss'].tolist())
    break

1


loading configuration file config.json from cache at /vol/csedu-nobackup/project/tromanski/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /vol/csedu-nobackup/project/tromanski/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/

2


  0%|          | 0/872 [00:00<?, ?it/s]


In [24]:
attribution

tensor([0.0094, 1.0098, 1.0037, 1.0117, 1.0120, 1.0743, 1.0155, 1.0359, 1.0290,
        1.0241, 1.0095, 1.0017], grad_fn=<SumBackward1>)

## Testing 

In [53]:
# Reference attributions to compare the current `results` against.
# NOTE(review): within this notebook the only code that writes this file is the
# commented-out `json.dump(results, f)` cell, so the file must already exist on disk.
with open('/home/tromanski/thesis/data/GAE_attr/OG_val_bert_sst.json', 'r') as f:
    results_og = json.load(f)

In [55]:
# Compare the stored reference attributions (`results_og`) with the ones held in
# `results`, entry by entry, and collect every index that differs.
og_atts = results_og.get('attributions', [])
new_atts = results.get('attributions', [])
n_og, n_new = len(og_atts), len(new_atts)

print(f"len(results_og) = {n_og}, len(results) = {n_new}")

# Each record is (index, reference shape, new shape, max abs difference or None).
mismatches = []
for idx, pair in enumerate(zip(og_atts, new_atts)):
    ref, cand = (np.asarray(item, dtype=float) for item in pair)
    same_shape = ref.shape == cand.shape
    if same_shape and np.allclose(ref, cand, rtol=1e-5, atol=1e-8):
        continue
    # A maximum difference is only defined when the two shapes agree.
    maxdiff = float(np.abs(ref - cand).max()) if same_shape else None
    mismatches.append((idx, ref.shape, cand.shape, maxdiff))
    if len(mismatches) <= 10:
        print(f"Mismatch idx={idx} shapes={ref.shape} vs {cand.shape} maxdiff={maxdiff}")

# zip() stops at the shorter list, so report the unpaired tail separately.
if n_og != n_new:
    extra_og = list(range(n_new, n_og)) if n_og > n_new else []
    extra_new = list(range(n_og, n_new)) if n_new > n_og else []
    print("Different number of attribution entries. Extra indices in results_og:",
          extra_og)
    print("Extra indices in results:",
          extra_new)

if mismatches:
    print(f"Total mismatches: {len(mismatches)} (showing up to 10).")
else:
    print("All attributions match (within tolerance).")

# keep mismatches for inspection
mismatches

len(results_og) = 872, len(results) = 872
All attributions match (within tolerance).


[]

In [None]:
# Single-sample variant of the custom-BERT run, without the loop: loads the
# model with explain=True, calls `model.explain()` and attributes sample 0.
# NOTE(review): this cell has no execution count, and unlike the cells above it
# calls `forward_and_explain` with `gammas=` and without `method='GAE'` — confirm
# which method that selects.
from utils.generic import load_custom_bert


print('1')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_custom_bert(device=device, finetuned=True, explain=True, train=False)
print('2')
x = test_data_loader[0]
# NOTE(review): `results` is reset here but never filled in this cell.
results = {'losses':[], 'attributions':[]}
model.explain()
    

# Each field goes through float32, gets a leading batch axis and is cast to int64.
input_ids = torch.tensor(np.float32(x['input_ids']) , requires_grad=True).unsqueeze(0).long().to(device)
attention_mask = torch.tensor(np.float32(x['attention_mask']), requires_grad=True).unsqueeze(0).long().to(device)
token_type_ids = torch.tensor(np.float32(x['token_type_ids'])).unsqueeze(0).long().to(device)
words = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
y_true = torch.tensor(x['label']).to(device)
labels_in = torch.tensor([int(y_true)]*len(input_ids)).long().to(device)
print('1')
# Plain forward pass for loss and prediction.
outs = model(input_ids=input_ids,
                            labels = labels_in)
print('2')
loss = outs['loss'].detach().cpu().numpy()
y_pred = np.argmax(outs['logits'].squeeze().detach().cpu().numpy())

gammas = [0.00,0.00, 0.00]
print('3')
# Explanation pass; the attribution is read from the 'R' entry of the result.
outs = model.forward_and_explain(input_ids=input_ids, cl=y_true,
                            labels = labels_in, 
                                gammas = gammas)
print('4')


attribution = outs['R'].squeeze()

In [None]:
# NumPy reference computation of the GAE attribution from the attention
# probabilities and attention gradients stored on `model`.
attns = [model.attention_probs[k].detach().cpu().numpy() for k in sorted(model.attention_probs.keys())]
attentions_mat = np.stack(attns, axis=0).squeeze()
attns = [model.attention_gradients[k].detach().cpu().numpy() for k in sorted(model.attention_gradients.keys())]
attentions_grads = np.stack(attns, axis=0).squeeze()
# Keep only positive attention x gradient contributions.
attentions_mat = torch.tensor(attentions_mat * attentions_grads).clamp(min=0)
# Average over dim 1. The tensor is already clamped, so it is neither re-wrapped
# in torch.tensor (which warns on tensor input) nor clamped a second time.
attentions_mat = attentions_mat.mean(dim=1)
joint_attentions = _compute_rollout_attention(attentions_mat)
# Zero the [0, 0] entry of every rollout matrix (the CLS-to-CLS entry, going by
# the label printed in the "Run" cell).
joint_attentions[:, 0, 0] = 0
idx = 0
attribution = joint_attentions[idx].sum(0)

## Random 

In [32]:
!jobs/single_attack.sh

Starting attack.sh on cn84 at Tue Dec  9 04:17:08 PM CET 2025
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Model name: custom-bert-finetuned
Tokenizer name: google-bert/bert-base-uncased
Dataset: sst2
Approach: location
Position target: 1
Loss function: rank
Lambda: 10.0
Epochs: 4
Project dir: /home/tromanski/thesis
Model dir: /vol/csedu-nobackup/project/tromanski
Eval only: True
Subsample size: None
Get attributions: True
Learning rate: 1e-05
Optimizer: adamw
Scheduler type: linear
Warmup percent: 0.1
Batch size: 32
Seed: 42
loading configuration file config.json from cache at /vol/csedu-nobackup/project/tromanski/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hi

In [30]:
!jobs/single_attack.sh

Starting attack.sh on cn84 at Tue Dec  9 03:57:36 PM CET 2025
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Model name: custom-bert-finetuned
Tokenizer name: google-bert/bert-base-uncased
Dataset: sst2
Approach: location
Position target: 1
Loss function: rank
Lambda: 10.0
Epochs: 4
Project dir: /home/tromanski/thesis
Model dir: /vol/csedu-nobackup/project/tromanski
Eval only: True
Subsample size: None
Get attributions: True
Learning rate: 1e-05
Optimizer: adamw
Scheduler type: linear
Warmup percent: 0.1
Batch size: 32
Seed: 42
loading configuration file config.json from cache at /vol/csedu-nobackup/project/tromanski/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hi