In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
import transformers

In [5]:
from transformers import BertModel,BertTokenizer
# create a BERT model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# get the list of layers
layers = list(model.encoder.layer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
layers

[BertLayer(
   (attention): BertAttention(
     (self): BertSelfAttention(
       (query): Linear(in_features=768, out_features=768, bias=True)
       (key): Linear(in_features=768, out_features=768, bias=True)
       (value): Linear(in_features=768, out_features=768, bias=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (output): BertSelfOutput(
       (dense): Linear(in_features=768, out_features=768, bias=True)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
   )
   (intermediate): BertIntermediate(
     (dense): Linear(in_features=768, out_features=3072, bias=True)
     (intermediate_act_fn): GELUActivation()
   )
   (output): BertOutput(
     (dense): Linear(in_features=3072, out_features=768, bias=True)
     (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
 ),
 BertLayer(
   (attention): BertAttention(
     (self): 

****
## Checking the dataset

In [7]:
import pandas as pd

cola_dataset=pd.read_csv('/Users/yashsurange/adapter_bert/cola_dataset/raw/in_domain_train.tsv',sep='\t')
cola_dataset.columns=['source','acceptibility','originality','sentence']

In [7]:
cola_dataset.head()

Unnamed: 0,source,acceptibility,originality,sentence
0,gj04,1,,One more pseudo generalization and I'm giving up.
1,gj04,1,,One more pseudo generalization or I'm giving up.
2,gj04,1,,"The more we study verbs, the crazier they get."
3,gj04,1,,Day by day the facts are getting murkier.
4,gj04,1,,I'll fix you a drink.


In [8]:
cola_dataset[['acceptibility','sentence']].head()

Unnamed: 0,acceptibility,sentence
0,1,One more pseudo generalization and I'm giving up.
1,1,One more pseudo generalization or I'm giving up.
2,1,"The more we study verbs, the crazier they get."
3,1,Day by day the facts are getting murkier.
4,1,I'll fix you a drink.


In [9]:
cola_dataset.iloc[1]

source                                                       gj04
acceptibility                                                   1
originality                                                   NaN
sentence         One more pseudo generalization or I'm giving up.
Name: 1, dtype: object

****
## Checking tokenisation done in dataset.py

In [8]:
path='/Users/yashsurange/adapter_bert/cola_dataset/raw/in_domain_train.tsv'

In [9]:
from dataset import dataset_adapter

train_dataset_tokenised=dataset_adapter(path,tokenizer)

In [10]:
train_dataset_tokenised.data.head()

Unnamed: 0,acceptibility,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [None]:
# When max length is kept at 20. We can see padding i.e padding with zeros 
train_dataset_tokenised.__getitem__(0)['sentence']['input_ids']

tensor([[  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
          2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0]])

In [12]:
# Current implementation of getitem function. The max length has been fixed at 15
train_dataset_tokenised.__getitem__(0)

{'acceptibility': 1,
 'input_ids': tensor([[ 101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894,
          1996, 2279,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# From token ids to tokens
train_dataset_tokenised.tokenizer.convert_ids_to_tokens(train_dataset_tokenised.__getitem__(3)['sentence']['input_ids'].reshape([-1]))

['[CLS]',
 'the',
 'more',
 'we',
 'study',
 'verbs',
 ',',
 'the',
 'cr',
 '##azi',
 '##er',
 'they',
 'get',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

****

### Understanding how to extract the last hidden state

In [20]:
## Experimenting with engine.py file

from engine import BertClassifier

In [21]:
testing_bertclassifier=BertClassifier(10)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.2.output.adap

In [None]:
state=testing_bertclassifier(train_dataset_tokenised.__getitem__(0)['input_ids'])

In [36]:
# Accessing the last hidden state
# Since we passed a sequence of 15 tokens, we get a contextual representation of each of those tokens.
# Size of the representation is 768
state.last_hidden_state.shape

torch.Size([1, 15, 768])

In [38]:
# We have extracted the representations of the first token i.e is CLS token.
state.last_hidden_state[0,0,:].shape

torch.Size([768])

****
### Checking the trainer



In [3]:
from train import Trainer

In [4]:
trainer_test=Trainer()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.2.output.adap

In [5]:
trainer_test.loss_fct


BCEWithLogitsLoss()

In [57]:
# Since, we want to optimize only parameters of the adapter modules and layer normalization
l=["adapter", "LayerNorm"]
[n for n, p in trainer_test.model.named_parameters() if any([(nd in n) for nd in l])]


['bert_model.embeddings.LayerNorm.weight',
 'bert_model.embeddings.LayerNorm.bias',
 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight',
 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias',
 'bert_model.encoder.layer.0.attention.output.adapter_module.linear_down_projection.weight',
 'bert_model.encoder.layer.0.attention.output.adapter_module.linear_down_projection.bias',
 'bert_model.encoder.layer.0.attention.output.adapter_module.linear_up_projection.weight',
 'bert_model.encoder.layer.0.attention.output.adapter_module.linear_up_projection.bias',
 'bert_model.encoder.layer.0.output.LayerNorm.weight',
 'bert_model.encoder.layer.0.output.LayerNorm.bias',
 'bert_model.encoder.layer.0.output.adapter_module.linear_down_projection.weight',
 'bert_model.encoder.layer.0.output.adapter_module.linear_down_projection.bias',
 'bert_model.encoder.layer.0.output.adapter_module.linear_up_projection.weight',
 'bert_model.encoder.layer.0.output.adapter_module.linear_up_projection

In [61]:
for b in trainer_test.train_dataloader:
    #test_model=BertClassifier(1)
    # test_model(b['input_ids'])
    #print(b['input_ids'].squeeze(1).shape)
    #print(b['acceptibility'].shape)
    print(b['acceptibility'].shape)
    print(trainer_test.model(b['input_ids'].squeeze(1)).dtype)
    #print(b['input_ids'].shape)
    break

torch.Size([32])
torch.float32
