In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import transformers

In [4]:
from transformers import BertModel,BertTokenizer
# create a BERT model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# get the list of layers
layers = list(model.encoder.layer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


****
## Checking the dataset

In [5]:
import pandas as pd

cola_dataset=pd.read_csv('/Users/yashsurange/adapter_bert/cola_dataset/raw/in_domain_train.tsv',sep='\t')
cola_dataset.columns=['source','acceptibility','originality','sentence']

In [6]:
cola_dataset.head()

Unnamed: 0,source,acceptibility,originality,sentence
0,gj04,1,,One more pseudo generalization and I'm giving up.
1,gj04,1,,One more pseudo generalization or I'm giving up.
2,gj04,1,,"The more we study verbs, the crazier they get."
3,gj04,1,,Day by day the facts are getting murkier.
4,gj04,1,,I'll fix you a drink.


In [7]:
cola_dataset[['acceptibility','sentence']].head()

Unnamed: 0,acceptibility,sentence
0,1,One more pseudo generalization and I'm giving up.
1,1,One more pseudo generalization or I'm giving up.
2,1,"The more we study verbs, the crazier they get."
3,1,Day by day the facts are getting murkier.
4,1,I'll fix you a drink.


In [8]:
cola_dataset.iloc[1]

source                                                       gj04
acceptibility                                                   1
originality                                                   NaN
sentence         One more pseudo generalization or I'm giving up.
Name: 1, dtype: object

****
## Checking tokenisation done in dataset.py

In [9]:
path='/Users/yashsurange/adapter_bert/cola_dataset/raw/in_domain_train.tsv'

In [21]:
from dataset import dataset_adapter

train_dataset_tokenised=dataset_adapter(path,tokenizer)

In [11]:
train_dataset_tokenised.data.head()

Unnamed: 0,acceptibility,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [20]:
# When maxlength was kept at 15
train_dataset_tokenised.__getitem__(0)['sentence']['input_ids']

tensor([[ 101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894,
         1996, 2279,  102]])

In [22]:
# When max length is kept at 10
train_dataset_tokenised.__getitem__(0)['sentence']['input_ids']

tensor([[ 101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106,  102]])

In [14]:
# When max length is kept at 20. We can see padding i.e padding with zeros 
train_dataset_tokenised.__getitem__(0)['sentence']['input_ids']

tensor([[  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
          2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0]])

In [15]:
# From token ids to tokens
train_dataset_tokenised.tokenizer.convert_ids_to_tokens(train_dataset_tokenised.__getitem__(3)['sentence']['input_ids'].reshape([-1]))

['[CLS]',
 'the',
 'more',
 'we',
 'study',
 'verbs',
 ',',
 'the',
 'cr',
 '##azi',
 '##er',
 'they',
 'get',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

****

In [37]:
def get_train_dataloader(train_path, tokenizer, batch_size):
    train_dataset = dataset_adapter(train_path, tokenizer)
    #train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size
    )
    return train_dataloader

In [38]:
test=get_train_dataloader(path,tokenizer,100)

In [39]:
for b in test:
    print(b)
    break

{'acceptibility': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 1]), 'input_ids': tensor([[[ 101, 2256, 2814,  ..., 1996, 2279,  102]],

        [[ 101, 2028, 2062,  ..., 1012,  102,    0]],

        [[ 101, 2028, 2062,  ..., 1012,  102,    0]],

        ...,

        [[ 101, 4862, 4143,  ...,    0,    0,    0]],

        [[ 101, 1996, 2125,  ...,    0,    0,    0]],

        [[ 101, 1996, 2062,  ...,    0,    0,    0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[