In [52]:
from omegaconf import OmegaConf
import re
import pandas as pd
import numpy as np
import warnings
from IPython.core.display import display
import torch

from torch import nn
from transformers import RobertaTokenizer, RobertaModel, DataCollatorWithPadding
from pipeline.datasets.rte_dataset import RTEDataset, RTETestDataset
from pipeline.models.roberta_classifier import RoBERTaClassifier
print(torch.__version__)
print(torch.cuda.is_available())
%load_ext autoreload
%autoreload 2

1.10.1+cu102
True
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
train_data = pd.read_json(path_or_buf='../data/RTE/train.jsonl', lines=True).set_index('idx')
val_data = pd.read_json(path_or_buf='../data/RTE/val.jsonl', lines=True).set_index('idx')
test_data = pd.read_json(path_or_buf='../data/RTE/test.jsonl', lines=True)#.set_index('idx')

display(train_data.head())
print(train_data.shape)

Unnamed: 0_level_0,premise,hypothesis,label
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,not_entailment
1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,entailment
2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,entailment
3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,entailment
4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,not_entailment


(2490, 3)


In [65]:
train_data['label'].value_counts()

entailment        1249
not_entailment    1241
Name: label, dtype: int64

In [69]:
from sklearn.utils import compute_class_weight


labels_map = {
            'not_entailment': 0,
            'entailment': 1,
        }

train_y = np.array(train_data['label'].values)

class_weights = compute_class_weight(
            class_weight='balanced',
            classes=['entailment', 'not_entailment'],
            y=train_y
        )
class_weights = list(class_weights)
class_weights

[0.9967974379503602, 1.0032232070910556]

In [68]:
list(set(train_y))

['entailment', 'not_entailment']

In [5]:
model = RobertaModel.from_pretrained("roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
tokenizer.encode_plus(
    'No Weapons of Mass Destruction Found in Iraq Yet.',
    'Weapons of Mass Destruction Found in Iraq.',
    padding=False,
    truncation=True,
    max_length=150,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt',
)

{'input_ids': tensor([[    0,  3084, 28054,     9,  5370, 43207, 11911,    11,  3345,  3507,
             4,     2,     2, 48637,     9,  5370, 43207, 11911,    11,  3345,
             4,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [44]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [27]:
loss = nn.CrossEntropyLoss()
input_tensor = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input_tensor, target)
output

tensor(1.9460, grad_fn=<NllLossBackward0>)

In [24]:
input_tensor

tensor([[ 0.5382,  0.6787,  0.8633, -0.4784, -0.4631],
        [-0.0049,  0.1128, -0.6147, -1.2283,  0.5026],
        [ 0.7810,  0.9470,  3.2067, -0.9198,  0.1095]], requires_grad=True)

In [25]:
target

tensor([1, 0, 1])

In [21]:
input_tensor.shape, target.shape

(torch.Size([3, 5]), torch.Size([3]))

In [28]:
loss = nn.CrossEntropyLoss()
input_tensor = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input_tensor, target)
output


tensor(1.8654, grad_fn=<DivBackward1>)

In [29]:
target

tensor([[0.0187, 0.3116, 0.3325, 0.0450, 0.2923],
        [0.1780, 0.5081, 0.0480, 0.1720, 0.0940],
        [0.0477, 0.4098, 0.0778, 0.0860, 0.3788]])

In [48]:
# loss = nn.BCEWithLogitsLoss(weight=torch.tensor([1.0, 2.0]))
# inputs = torch.randn(3, requires_grad=True)
# target = torch.empty(3).random_(2)
# output = loss(inputs, target)
# output

torch.Size([3])

In [14]:
# for row in train_data.values:
#     tokenizer.encode(row[0], row[1]
#     print(i)
#     1/0

In [6]:
dataset = RTEDataset(
    data=train_data.values,
    tokenizer=tokenizer,
    max_length=200,
)

dataset.__getitem__(1)

{'input_ids': tensor([    0,   250,   317,     9, 26130,     6,    71,  8509,   610,  1206,
          3082,   962,     6,  1059,    10,   317,     9,  4821,     6,    25,
          7733,  4019, 15828,  4366,    11,  3301,  1568,     7,  2458,     5,
          8809,     9,    92,  8509, 20742, 42171,     4,     2,     2, 36017,
         20742, 42171,    16,     5,    92,   884,     9,     5,  7733,  4019,
          2197,     4,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]),
 'label': 1}

In [56]:
dataset.__getitem__(1)['input_ids'].shape

torch.Size([53])

In [26]:
tokenizer.decode(dataset.__getitem__(1)['input_ids'])

'<s>Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations.</s></s>Bacteria is winning the war against antibiotics.</s>'

In [33]:
lens = []
for i in range(len(dataset)):
    lens.append(len(dataset.__getitem__(i)['input_ids']))

In [40]:
np.percentile(lens, 99)

196.0

In [41]:
max(lens)

292

In [7]:
collator = DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding='longest',
        )

train_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=8,
            # num_workers=cfg.datamodule.num_workers,
            # pin_memory=cfg.datamodule.pin_memory,
            collate_fn=collator,
            shuffle=True,
        )

batch = next(iter(train_loader))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [89]:
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [48]:
batch['labels']

tensor([1, 0])

In [84]:
loss_cfg = OmegaConf.load('../cfg/loss/bce_logits.yaml')
loss_cfg.params.weight = [1.0, 2.0]
print(loss_cfg.params)

{'reduction': 'sum', 'weight': [1.0, 2.0]}


In [80]:
type(loss_cfg.params.weight[0])

float

In [35]:
model_cfg = OmegaConf.load('../cfg/model/roberta.yaml')
model_cfg['model'] = model_cfg

clf = RoBERTaClassifier(model_cfg)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
# tmp = dataset.__getitem__(1)
logits, probs = clf(**batch)
probs

tensor([[0.4469, 0.5531],
        [0.4527, 0.5473],
        [0.4516, 0.5484],
        [0.4506, 0.5494],
        [0.4499, 0.5501],
        [0.4493, 0.5507],
        [0.4465, 0.5535],
        [0.4492, 0.5508]], grad_fn=<SoftmaxBackward0>)

In [37]:
logits, logits.shape

(tensor([[-0.1301,  0.0831],
         [-0.1360,  0.0536],
         [-0.1255,  0.0689],
         [-0.1271,  0.0711],
         [-0.1330,  0.0680],
         [-0.1313,  0.0721],
         [-0.1313,  0.0835],
         [-0.1341,  0.0700]], grad_fn=<AddmmBackward0>),
 torch.Size([8, 2]))

In [41]:
batch['labels'], batch['labels'].shape

(tensor([0, 0, 1, 0, 0, 0, 1, 1]), torch.Size([8]))

In [39]:
probs = torch.softmax(logits, dim=1)
probs

tensor([[0.4469, 0.5531],
        [0.4527, 0.5473],
        [0.4516, 0.5484],
        [0.4506, 0.5494],
        [0.4499, 0.5501],
        [0.4493, 0.5507],
        [0.4465, 0.5535],
        [0.4492, 0.5508]], grad_fn=<SoftmaxBackward0>)

In [45]:
batch['labels'].float().unsqueeze(1)

tensor([[1.],
        [0.]])

In [42]:
loss(logits, batch['labels'])

tensor(0.7228, grad_fn=<NllLossBackward0>)

In [44]:
import torchmetrics

acc = torchmetrics.Accuracy()
acc(logits, batch['labels'])

tensor(0.3750)

In [47]:
import glob

model_names = glob.glob(f'../outputs/2022-03-21_09-38-12/saved_models/*')
best_model = [name for name in model_names if 'best' in name][0]
checkpoint = torch.load(best_model)
clf.load_state_dict(checkpoint)

<All keys matched successfully>

In [49]:
best_model

'../outputs/2022-03-21_09-38-12/saved_models/best_epoch=5-val_accuracy=0.7545.pth'

In [50]:
def convert_to_jit(model: nn.Module, save_name: str, cfg) -> None:
    input_shape = (1, 3, cfg.datamodule.main_image_size, cfg.datamodule.main_image_size)
    target_shape = 1
    # out_path = f'saved_models/{save_name}_jit.pt'
    model.eval()

    device = next(model.parameters()).device
    input_tensor = torch.ones(input_shape).float().to(device)
    target_tensor = torch.ones(target_shape, dtype=torch.long).to(device)
    traced_model = torch.jit.trace(model, (input_tensor, target_tensor))
    torch.jit.save(traced_model, save_name)


convert_to_jit(clf, '../outputs/2022-03-21_09-38-12/saved_models/best_model_jit.pt', model_cfg)

# m = torch.jit.script(clf)
#
# # Save to file
# torch.jit.save(m, '../outputs/2022-03-21_09-38-12/saved_models/best_model_jit.pt')
# This line is equivalent to the previous
# m.save("scriptmodule.pt")

NotSupportedError: Compiled functions can't take variable number of arguments or use keyword-only arguments with defaults:
  File "/home/nbaranov/projects/personal/nlp_template/pipeline/models/roberta_classifier.py", line 24
        input_ids,
        attention_mask=None,
        *args, **kwargs
                ~~~~~~~ <--- HERE
    ):
        model_output = self.model(


In [72]:
test_data.values[0]

array(["Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case.",
       'Shukla is related to Mangla.', 0], dtype=object)

In [73]:
test_dataset = RTETestDataset(data=test_data.values[:10], tokenizer=tokenizer, max_length=512)

test_dataset.__getitem__(0)

{'idx': 0,
 'input_ids': tensor([    0,   448,  1097,  2560,    21, 17323,    71,  4145, 18257,  3119,
            18,  2761, 40371,  3592,   840,  1350,  2560,     6,    54,    21,
             5,    78,  4562,    11,     5,   403,     4,     2,     2,  3609,
          1350,  2560,    16,  1330,     7, 12756,  2560,     4,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [74]:
test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=64,
            # num_workers=self.cfg.datamodule.num_workers,
            # pin_memory=self.cfg.datamodule.pin_memory,
            collate_fn=collator,
            shuffle=False,
        )

In [75]:
def get_test_scores(model, test_dataloader):
    # torch.set_grad_enabled(False)
    model.eval()
    all_preds = []
    all_probs = []
    with torch.inference_mode():
        for batch in test_dataloader:
            logits = model(**batch)
            probs = torch.softmax(logits, dim=1)
            y_prob, y_pred = torch.max(probs, dim=1)
            all_preds.extend(y_pred.numpy())
            all_probs.extend(probs.detach().cpu().numpy())
    return all_preds, all_probs


all_preds, all_probs = get_test_scores(clf, test_loader)


In [94]:
def make_submission(test_dataset, test_preds):
    """
    {"idx": 12, "label": "not_entailment"}
    {"idx": 13, "label": "entailment"}
    """
    tag2label = {v: k for k,v in test_dataset.labels_map.items()}
    submission = []
    for idx in [i[-1] for i in test_dataset.data]:
        submission.append({'idx': idx, 'label': tag2label[test_preds[idx]]})

    return submission

[{'idx': 0, 'label': 'entailment'},
 {'idx': 1, 'label': 'entailment'},
 {'idx': 2, 'label': 'entailment'},
 {'idx': 3, 'label': 'entailment'},
 {'idx': 4, 'label': 'entailment'},
 {'idx': 5, 'label': 'not_entailment'},
 {'idx': 6, 'label': 'entailment'},
 {'idx': 7, 'label': 'not_entailment'},
 {'idx': 8, 'label': 'not_entailment'},
 {'idx': 9, 'label': 'entailment'}]

In [97]:
len(test_loader.dataset.data)

10

In [109]:
model.device

device(type='cpu')

In [116]:
from datasets import load_dataset

dataset = load_dataset('super_glue', 'rte', cache_dir='../data/')

Reusing dataset super_glue (../data/super_glue/rte/1.0.2/2fb163bca9085c1deb906aff20f00c242227ff704a4e8c9cfdfe820be3abfc83)


In [117]:
dataset['test']

Dataset({
    features: ['premise', 'hypothesis', 'idx', 'label'],
    num_rows: 3000
})

In [118]:
print(next(iter(dataset['test'])))

{'hypothesis': 'Shukla is related to Mangla.', 'idx': 0, 'label': -1, 'premise': "Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case."}


In [123]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
type(device)

torch.device