In [1]:
from modeling_rmt import RMTEncoderForSequenceClassification
import torch
import json

from pathlib import Path
from torch.utils.data import DataLoader, DistributedSampler, Dataset
import numpy as np

from transformers import AutoConfig, AutoTokenizer, HfArgumentParser, AutoModel
from matplotlib import pyplot as plt

In [9]:
# Backbone checkpoint name; the tokenizer is loaded from the same identifier.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Maps the label strings found in the data file to integer class ids.
labels_map = {'false': 0, 'true': 1}

# Model and data settings read by the cells below.
config = dict(
    num_mem_tokens=10,
    input_size=512,
    # input_seg_size=args.input_seg_size,
    model_attr='deberta',
    # backbone_cls=backbone_cls,
    bptt_depth=-1,
    pad_token_id=0,
    cls_token_id=tokenizer.cls_token_id,
    sep_token_id=tokenizer.sep_token_id,
    # NOTE(review): hard-coded instead of being read from the tokenizer --
    # confirm this matches the value the checkpoint was trained with.
    eos_token_id=102,
    data_path="data/test.jsonl",
    batch_size=4,
    gradient_accumulation_steps=2,
    data_n_workers=1,
)

# Keyword arguments forwarded to the tokenizer inside collate_fn:
# truncate to the model input size and pad each batch to its longest sample.
encode_plus_kwargs = dict(
    max_length=config["input_size"],
    truncation=True,
    padding='longest',
    pad_to_multiple_of=1,
)

In [3]:
class HyperpartisanDataset(Dataset):
    """In-memory dataset backed by a JSON-lines file.

    Every non-blank line of ``datafile`` is parsed with ``json.loads`` and
    kept in ``self.data``. Indexing returns the ``(x_field, label_field)``
    values of the corresponding record.

    Args:
        datafile: path to the JSON-lines file (``str`` or ``pathlib.Path``).
        x_field: key of the input value in each record.
        label_field: key of the label value in each record.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """

    def __init__(self, datafile, x_field='text', label_field='label'):
        if not isinstance(datafile, Path):
            # accept str (or any os.PathLike) and normalise to Path
            datafile = Path(datafile)
        # The context manager guarantees the handle is closed even if parsing
        # fails; blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        with datafile.open('r', encoding='utf-8') as handle:
            self.data = [json.loads(line) for line in handle if line.strip()]
        self.x_field = x_field
        self.label_field = label_field

    def __len__(self):
        """Number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair of record ``idx``."""
        record = self.data[idx]
        return record[self.x_field], record[self.label_field]

In [4]:
def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into one model-ready dict.

    The texts are tokenized together with the notebook-level ``tokenizer``
    and ``encode_plus_kwargs``; the labels are mapped through ``labels_map``
    and stored under the ``'labels'`` key as a tensor.
    """
    texts, raw_labels = zip(*batch)
    encoded = tokenizer.batch_encode_plus(
        list(texts),
        return_tensors='pt',
        **encode_plus_kwargs,
    )
    label_ids = np.array([labels_map[name] for name in raw_labels])
    return {**encoded, 'labels': torch.from_numpy(label_ids)}

In [7]:
# Restore the fine-tuned RMT classifier from a training checkpoint.
load_path = "deberta/run_1/model_best.pth"
checkpoint = torch.load(load_path, map_location='cuda')
model = RMTEncoderForSequenceClassification.from_pretrained(model_name)
# All RMT settings are read from `config` so this cell cannot drift from the
# values declared in the configuration cell.
model.set_params(num_mem_tokens=config["num_mem_tokens"],
                 input_size=config["input_size"],
                 # input_seg_size=1002,
                 model_attr=config["model_attr"],
                 # backbone_cls=transformers.BartForConditionalGeneration,
                 bptt_depth=config["bptt_depth"],
                 pad_token_id=tokenizer.pad_token_id,
                 cls_token_id=tokenizer.cls_token_id,
                 sep_token_id=tokenizer.sep_token_id,
                 eos_token_id=config["eos_token_id"],)
# NOTE(review): strict=False silently ignores missing/unexpected keys --
# confirm the checkpoint actually matches this architecture.
model.load_state_dict(checkpoint["model_state_dict"], strict=False)
model.cuda()
# The notebook only runs inference: switch to eval mode so dropout is
# disabled and the inspected attention maps are deterministic.
model.eval()
model.zero_grad()

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [17]:
# Move the model to CPU: the inference cell below passes the DataLoader batch
# to the model as-is, and collate_fn builds that batch from CPU tensors.
model.to("cpu")

In [10]:
# Resolve the configured data file to an absolute path and load it in memory.
test_data_path = Path(config["data_path"]).expanduser().absolute()
test_dataset = HyperpartisanDataset(test_data_path)

# Samples per batch: batch size times the gradient accumulation steps.
per_worker_batch_size = config["batch_size"] * config["gradient_accumulation_steps"]
kwargs = dict(pin_memory=True, num_workers=config["data_n_workers"])

# Single-replica, non-shuffling sampler.
test_sampler = DistributedSampler(test_dataset, num_replicas=1, rank=0, shuffle=False)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=per_worker_batch_size,
    sampler=test_sampler,
    collate_fn=collate_fn,
    **kwargs,
)

In [18]:
# Run one forward pass on the first test batch and keep the attention maps.
# next(iter(...)) fails loudly on an empty loader instead of silently leaving
# `batch` / `pred` undefined (or stale from an earlier run).
batch = next(iter(test_dataloader))

# Inference only: disable autograd so no graph is built or kept alive.
# The batch tensors are passed as produced by collate_fn; if the model lives
# on another device, move input_ids / token_type_ids / attention_mask / labels
# to that device first.
with torch.no_grad():
    pred = model(**batch, output_attentions=True)

In [26]:
# Shape of the first entry of the returned attentions. The recorded output is
# [8, 12, 512, 512]: 8 equals batch_size * gradient_accumulation_steps and 512
# equals config["input_size"]. Presumably (batch, heads, seq, seq) -- confirm.
pred["attentions"][0].shape

torch.Size([8, 12, 512, 512])

In [None]:
# Tokens of the first sample in the batch, with the '▁' marker removed.
# The whole id list is converted in a single tokenizer call.
first_sample_ids = batch["input_ids"][0].tolist()
tokens = [tok.replace('▁', '') for tok in tokenizer.convert_ids_to_tokens(first_sample_ids)]

# The executed cell above reads the maps from pred["attentions"]; keep
# 'encoder_attentions' as the preferred key when the output provides it and
# fall back to 'attentions' otherwise, so this cell no longer raises KeyError.
attention_key = 'encoder_attentions' if 'encoder_attentions' in pred else 'attentions'

# One array per entry, taken from the first sample of the batch; .cpu() makes
# the NumPy conversion work regardless of the device the model ran on.
attentions = [
    layer_atts[0].detach().squeeze().cpu().numpy() if layer_atts[0] is not None else None
    for layer_atts in pred[attention_key]
]