In [23]:
from torch import nn
import torch
from transformers import BertTokenizerFast,BertModel,BertConfig
import torch.nn.functional as F
from torch.optim import Adam


class NerModel(nn.Module):
    bio_unique_labels =[
        'B-dis', 'B-sym', 'B-pro', 'B-equ', 'B-dru', 
        'B-ite', 'B-bod', 'B-dep', 'B-mic', 'I-dis', 
        'I-sym', 'I-pro', 'I-equ', 'I-dru', 'I-ite', 
        'I-bod', 'I-dep', 'I-mic', 'O-dis', 'O-sym', 
        'O-pro', 'O-equ', 'O-dru', 'O-ite', 'O-bod', 
        'O-dep', 'O-mic', 'E-dis', 'E-sym', 'E-pro', 
        'E-equ', 'E-dru', 'E-ite', 'E-bod', 'E-dep', 
        'E-mic', 'S-dis', 'S-sym', 'S-pro', 'S-equ', 
        'S-dru', 'S-ite', 'S-bod', 'S-dep', 'S-mic', 
        'O']
    def __init__(self,dropout= 0.1,predict=False):
        super(NerModel, self).__init__()
        self.__predict = predict
        self.__pretained_path = "./premodels/roberta-large-chinese/"
        self.__bert = BertModel.from_pretrained(self.__pretained_path)
        self.__bert_config = BertConfig.from_pretrained(self.__pretained_path)
        self.__hidden_size = self.__bert.config.hidden_size
        self.__dropout = nn.Dropout(dropout)
        self.__classifier = nn.Linear(in_features=self.__hidden_size,out_features=len(self.bio_unique_labels),bias=True)
        self.__softmax = nn.Softmax
        self.__torchmax = torch.max
        return
    
    def is_predict(self,predict:bool):
        self.__predict = predict
        return
        
    def forward(self,input_ids,attention_mask):
        with torch.no_grad():
            token_series = self.__bert(input_ids,attention_mask)[0]
        token_series = self.__dropout(token_series)
        token_series = self.__classifier(token_series)
        batch_size, seq_len, ner_class_num = token_series.shape
        logits = token_series.view(
            (batch_size * seq_len, ner_class_num))
        return token_series if self.__predict else logits
      

In [24]:
model = NerModel()
model.train()

NerModel(
  (_NerModel__bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elem

In [25]:
batch_size = 32
num_epoch = 30  
learning_rate = 1e-5  

In [26]:
optimizer = Adam(model.parameters(), learning_rate) 
CE_loss = nn.CrossEntropyLoss()

In [27]:
#load train & val data
from train import handle_raw_data
from torch.utils.data import Dataset ,DataLoader
class NerDataset(Dataset):
    def __init__(self,pd):
        super().__init__()
        self.__dataset = pd
        self.__data_size = len(pd)        
    def __getitem__(self, index:int):
        return (self.__dataset.loc[index,"W"], self.__dataset.loc[index,"B"]) 
        
    def __len__(self):
        return self.__data_size


In [28]:
unique_labels = [
    "dis",
    "sym",
    "pro",
    "equ",
    "dru",
    "ite",
    "bod",
    "dep",
    "mic",
]

bio_unique_labels = []
bio_unique_labels.extend([ "B-"+l for l in unique_labels])
bio_unique_labels.extend([ "I-"+l for l in unique_labels])
bio_unique_labels.extend([ "O-"+l for l in unique_labels])
bio_unique_labels.extend([ "E-"+l for l in unique_labels])
bio_unique_labels.extend([ "S-"+l for l in unique_labels])
bio_unique_labels.append("O")

labels_to_ids = {k: v for v, k in enumerate(sorted(bio_unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(bio_unique_labels))}
print(labels_to_ids)

{'B-bod': 0, 'B-dep': 1, 'B-dis': 2, 'B-dru': 3, 'B-equ': 4, 'B-ite': 5, 'B-mic': 6, 'B-pro': 7, 'B-sym': 8, 'E-bod': 9, 'E-dep': 10, 'E-dis': 11, 'E-dru': 12, 'E-equ': 13, 'E-ite': 14, 'E-mic': 15, 'E-pro': 16, 'E-sym': 17, 'I-bod': 18, 'I-dep': 19, 'I-dis': 20, 'I-dru': 21, 'I-equ': 22, 'I-ite': 23, 'I-mic': 24, 'I-pro': 25, 'I-sym': 26, 'O': 27, 'O-bod': 28, 'O-dep': 29, 'O-dis': 30, 'O-dru': 31, 'O-equ': 32, 'O-ite': 33, 'O-mic': 34, 'O-pro': 35, 'O-sym': 36, 'S-bod': 37, 'S-dep': 38, 'S-dis': 39, 'S-dru': 40, 'S-equ': 41, 'S-ite': 42, 'S-mic': 43, 'S-pro': 44, 'S-sym': 45}


In [29]:
from transformers import BertTokenizer
import numpy as np
def my_coffate_fn(batch):
    sentences, labels = [] , []
    tokenizer = BertTokenizer.from_pretrained("./premodels/roberta-large-chinese/")
    for (s , l)  in batch:
        token_series = tokenizer.encode_plus(
            list(s),
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors="np",
            is_split_into_words=False
        )
        model_inputs = (token_series["input_ids"],token_series["attention_mask"])
        l.insert(0,"O")
        l.extend(["O"]*(512-len(l)))
        if len(l) != 512:
            print(len(l))
            raise BaseException 
        labels_series = []
        labels_series.extend([labels_to_ids[i] for i in l ])
        if len(labels_series) != 512:
            raise BaseException 
        sentences.append(model_inputs)
        labels.append(labels_series)
    return sentences,labels
        

In [30]:
from train import handle_raw_data
train_data = handle_raw_data(raw_path="./data/dev.json",csv_path="./data/dev.csv")
train_data_dataset = NerDataset(train_data)


  4%|▎         | 186/5000 [00:00<00:07, 610.97it/s]

  6%|▌         | 309/5000 [00:00<00:07, 603.80it/s]

511 511


  9%|▊         | 431/5000 [00:00<00:07, 603.42it/s]

511 511


 15%|█▍        | 735/5000 [00:01<00:07, 596.32it/s]

511 511


 30%|███       | 1508/5000 [00:02<00:06, 581.94it/s]

511 511
511 511


 32%|███▎      | 1625/5000 [00:02<00:05, 577.65it/s]

511 511


 37%|███▋      | 1857/5000 [00:03<00:05, 568.92it/s]

511 511
511 511


 44%|████▍     | 2203/5000 [00:03<00:04, 569.32it/s]

511 511


 68%|██████▊   | 3380/5000 [00:05<00:03, 537.95it/s]

511 511


 75%|███████▌  | 3758/5000 [00:06<00:02, 531.31it/s]

511 511
511 511


 83%|████████▎ | 4130/5000 [00:07<00:01, 521.91it/s]

511 511
511 511


100%|██████████| 5000/5000 [00:09<00:00, 555.16it/s]


In [31]:
dataloader = DataLoader(
    dataset=train_data_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=my_coffate_fn
)

In [37]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
total_loss = 0
for k1,(k2,k3 )in enumerate(dataloader):
    input_ids = torch.tensor([k2[0][0][0],k2[1][0][0]],dtype=torch.int64)
    attention_mask = torch.tensor([k2[0][1][0],k2[1][1][0]],dtype=torch.uint8)
    print("input_ids",input_ids.shape)
    print("attention_mask",attention_mask.shape)
    outputs = model(input_ids,attention_mask) 

    labels = torch.Tensor(k3[0][1],k3[1][1])
    labels.view(-1)
    labels = labels.long()
    loss =criterion(outputs,labels)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

input_ids torch.Size([2, 512])
attention_mask torch.Size([2, 512])


ValueError: Expected input batch_size (1024) to match target batch_size (27).