In [3]:
import ujson as json
from transformers import AutoTokenizer, RobertaModel
# Load the DocRED relation metadata.
# `data` maps a relation code (e.g. 'P17') to its textual name (e.g. 'country');
# `docred_rel2id` maps a relation code to its integer class id.
# Both files are opened through context managers so the handles are closed
# deterministically instead of being left to the garbage collector.
with open("dataset/docred/rel_info.json", "r") as fh:
    data = json.load(fh)

with open("meta/rel2id.json", "r") as fh:
    docred_rel2id = json.load(fh)

In [37]:
# Tokenizer matching the roberta-large checkpoint used for the encoder further down.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="roberta-large")

In [38]:
# Inspect which special tokens (bos/eos/sep/pad/cls/mask/unk) this tokenizer defines.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [41]:
# Build (class id, relation code, relation name) triples ordered by class id,
# with a synthetic 'no relation' entry placed first as class 0.
id_rels = [(0, "", 'no relation')] + sorted(
    ((docred_rel2id[d], d, data[d]) for d in data),
    key=lambda x: x[0],
)

# Concatenate every relation name into one token sequence, wrapping each name
# in "*" markers. Each name is tokenized exactly once (an earlier version also
# built a per-relation list that was overwritten before being used).
rels, rel_pos = [], []
for item in id_rels:
    # len(rels) is the index of the opening "*" inside `rels`; the +1 offset
    # presumably accounts for the leading special token added below, so that
    # rel_pos indexes the "*" in `input_ids` -- TODO confirm against the consumer.
    rel_pos.append(len(rels)+1)
    rels.extend(["*"]+tokenizer.tokenize(item[2])+["*"])

# Map tokens to vocabulary ids and let the tokenizer add its special tokens.
input_ids = tokenizer.convert_tokens_to_ids(rels)
input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)

In [52]:
# Number of relation classes, then total length of the encoded relation sequence.
print(len(id_rels), len(input_ids), sep="\n")

97
453


In [44]:
# Dump the intermediate structures, one per line, for manual inspection.
print(id_rels, rels, rel_pos, input_ids, sep="\n")

[(0, '', 'no relation'), (1, 'P17', 'country'), (2, 'P131', 'located in the administrative territorial entity'), (3, 'P27', 'country of citizenship'), (4, 'P150', 'contains administrative territorial entity'), (5, 'P577', 'publication date'), (6, 'P175', 'performer'), (7, 'P569', 'date of birth'), (8, 'P570', 'date of death'), (9, 'P161', 'cast member'), (10, 'P264', 'record label'), (11, 'P527', 'has part'), (12, 'P361', 'part of'), (13, 'P495', 'country of origin'), (14, 'P19', 'place of birth'), (15, 'P571', 'inception'), (16, 'P54', 'member of sports team'), (17, 'P102', 'member of political party'), (18, 'P463', 'member of'), (19, 'P3373', 'sibling'), (20, 'P40', 'child'), (21, 'P30', 'continent'), (22, 'P50', 'author'), (23, 'P1441', 'present in work'), (24, 'P1001', 'applies to jurisdiction'), (25, 'P69', 'educated at'), (26, 'P26', 'spouse'), (27, 'P607', 'conflict'), (28, 'P57', 'director'), (29, 'P159', 'headquarters location'), (30, 'P22', 'father'), (31, 'P400', 'platform')

In [6]:
import torch
# Load the cached relation-info features; later cells read relinfo_features['input_ids'].
# NOTE(review): torch.load deserializes via pickle -- only load this file if it
# comes from a trusted source.
relinfo_features = torch.load("dataset/docred/rel_info.json.roberta-large.pt")

In [57]:
# Length of the cached relation token-id sequence. This previously read from `a`,
# a name no cell in this notebook defines, so it failed on a fresh kernel.
len(relinfo_features["input_ids"])

453

In [1]:
import torch
import torch.nn as nn
from opt_einsum import contract
from long_seq import process_long_input
from losses import ATLoss
import torch.nn.functional as F
import numpy as np

class MultiHeadAttention(nn.Module):
    """Multi-head dot-product attention in which `key` also supplies the values.

    Queries and keys/values may have different feature sizes; both are linearly
    projected to `all_head_dim` features, which are then split evenly across
    `num_heads` heads.

    Args:
        query_dim: feature size of the `query` input.
        key_dim: feature size of the `key` input (also used as the score scale).
        all_head_dim: total projected feature size summed over all heads.
        num_heads: number of attention heads; must divide `all_head_dim`.

    Raises:
        ValueError: if `num_heads` is not positive or does not divide
            `all_head_dim`. Without this check the split below would either fail
            with an opaque stacking error or silently produce a different number
            of heads than requested.
    """

    def __init__(self, query_dim, key_dim, all_head_dim, num_heads):
        super().__init__()
        if num_heads <= 0 or all_head_dim % num_heads != 0:
            raise ValueError(
                f"all_head_dim ({all_head_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.query_dim = query_dim
        self.key_dim = key_dim
        self.all_head_dim = all_head_dim
        self.num_heads = num_heads

        self.W_query = nn.Linear(in_features=query_dim, out_features=all_head_dim)
        self.W_key = nn.Linear(in_features=key_dim, out_features=all_head_dim)
        self.W_value = nn.Linear(in_features=key_dim, out_features=all_head_dim)

    def forward(self, query, key, mask=None):
        """Attend from `query` positions over `key` positions.

        Args:
            query: tensor of shape [B, N_q, query_dim].
            key: tensor of shape [B, N_k, key_dim]; projected to both keys and values.
            mask: optional tensor of shape [B, N_k]. Positions whose value equals 1
                (or True) are attended to; every other position gets a score of -inf.

        Returns:
            A tuple `(out, scores)` where `out` has shape [B, N_q, all_head_dim] and
            `scores` holds the softmax-normalised weights of shape [h, B, N_q, N_k].
        """
        querys = self.W_query(query)  # [B, N_q, all_head_dim]
        keys = self.W_key(key)  # [B, N_k, all_head_dim]
        values = self.W_value(key)  # [B, N_k, all_head_dim]

        # Split the feature axis into heads and move the head axis to the front.
        head_size = self.all_head_dim // self.num_heads
        querys = torch.stack(torch.split(querys, head_size, dim=2), dim=0)  # [h, B, N_q, all_head_dim/h]
        keys = torch.stack(torch.split(keys, head_size, dim=2), dim=0)  # [h, B, N_k, all_head_dim/h]
        values = torch.stack(torch.split(values, head_size, dim=2), dim=0)  # [h, B, N_k, all_head_dim/h]

        ## score = softmax(QK^T / (d_k ** 0.5))
        scores = torch.matmul(querys, keys.transpose(2, 3))  # [h, B, N_q, N_k]
        # NOTE(review): the scale uses the input key size (key_dim), not the per-head
        # size (head_size). Kept as-is to preserve numerical behaviour -- confirm intent.
        scores = scores / (self.key_dim ** 0.5)

        ## mask
        if mask is not None:
            # [B, N_k] --> [1, B, 1, N_k]; masked_fill broadcasts this over heads and
            # query positions, so no [h, B, N_q, N_k] copy has to be materialised.
            keep = mask.unsqueeze(1).unsqueeze(0)
            scores = scores.masked_fill(keep != 1, -np.inf)
        # A row whose keys are all masked consists solely of -inf and becomes NaN here.
        scores = F.softmax(scores, dim=3)

        ## out = score * V
        out = torch.matmul(scores, values)  # [h, B, N_q, all_head_dim/h]
        # Concatenate the heads back along the feature axis.
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)  # [B, N_q, all_head_dim]

        return out, scores

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke test: one head, query features of size 3, key features of size 4,
# projected to 5 features.
attention = MultiHeadAttention(query_dim=3, key_dim=4, all_head_dim=5, num_heads=1)

## Inputs
qurry = torch.randn(8, 2, 3)
key = torch.randn(8, 6, 4)
# Two key-mask patterns alternate across the batch of 8. MultiHeadAttention fills
# every position whose mask value is not 1/True with -inf, so only the trailing
# True positions are attended to.
mask = torch.tensor(
    [[False, False, False, False, True, True],
     [False, False, False, True, True, True]]
).repeat(4, 1)

out, scores = attention(qurry, key, mask)

In [5]:
from transformers import AutoConfig, AutoModel, AutoTokenizer, RobertaForTokenClassification
# roberta-large configuration with 97 labels (the same count printed for id_rels
# above), then the bare encoder loaded from the PyTorch checkpoint.
config = AutoConfig.from_pretrained("roberta-large", num_labels=97)
model = AutoModel.from_pretrained("roberta-large", from_tf=False, config=config)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Encode the cached relation-token sequence as a single batch of size 1.
# Every position is attended to (all-ones mask) and the attention maps are
# requested alongside the hidden states.
relinfo_input_ids = torch.tensor(relinfo_features['input_ids'], dtype=torch.long).unsqueeze(0)
relinfo_attention_mask = torch.ones(1, len(relinfo_features['input_ids']), dtype=torch.float)
output = model(
    input_ids=relinfo_input_ids,
    attention_mask=relinfo_attention_mask,
    output_attentions=True,
)

In [8]:
# Display the result of the forward pass stored in `output` by the previous cell.
# This cell used to repeat the identical model call with the same inputs, paying
# for a second full forward pass just to render the value.
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0980, -0.0350, -0.0102,  ...,  0.0159,  0.0371,  0.1192],
         [-0.0497,  0.3024, -0.4512,  ...,  0.6732, -0.2505,  0.2125],
         [ 0.1111,  0.0217, -0.2229,  ...,  0.2840,  0.1918,  0.0486],
         ...,
         [-0.0379,  0.1685, -0.4055,  ..., -0.2023,  0.1356, -0.6966],
         [-0.0897, -0.0601, -0.8137,  ...,  0.2174, -0.4425, -0.2108],
         [-0.0624, -0.0574, -0.0145,  ..., -0.0694,  0.0351,  0.0734]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.1766,  0.6905,  0.4712,  ..., -0.0089,  0.3840, -0.4286]],
       grad_fn=<TanhBackward0>), hidden_states=None, past_key_values=None, attentions=(tensor([[[[3.2589e-04, 1.9685e-03, 1.4126e-03,  ..., 3.3757e-04,
           1.2362e-03, 4.4026e-03],
          [4.3062e-03, 6.3385e-03, 2.1231e-02,  ..., 2.1439e-04,
           4.0811e-05, 2.3417e-05],
          [9.4582e-03, 1.8193e-02, 6.0712e-03,  ..., 1.4886e-04,
           1