In [None]:
from torch import nn
import torch
from transformers import BertTokenizerFast,BertModel,BertConfig
import torch.nn.functional as F
from torch.optim import Adam


class NerModel(nn.Module):
    bio_unique_labels =[
        'B-dis', 'B-sym', 'B-pro', 'B-equ', 'B-dru', 
        'B-ite', 'B-bod', 'B-dep', 'B-mic', 'I-dis', 
        'I-sym', 'I-pro', 'I-equ', 'I-dru', 'I-ite', 
        'I-bod', 'I-dep', 'I-mic', 'O-dis', 'O-sym', 
        'O-pro', 'O-equ', 'O-dru', 'O-ite', 'O-bod', 
        'O-dep', 'O-mic', 'E-dis', 'E-sym', 'E-pro', 
        'E-equ', 'E-dru', 'E-ite', 'E-bod', 'E-dep', 
        'E-mic', 'S-dis', 'S-sym', 'S-pro', 'S-equ', 
        'S-dru', 'S-ite', 'S-bod', 'S-dep', 'S-mic', 
        'O']
    def __init__(self,dropout= 0.1,predict=False):
        super(NerModel, self).__init__()
        self.__predict = predict
        self.__pretained_path = "./premodels/roberta-large-chinese/"
        self.__bert = BertModel.from_pretrained(self.__pretained_path)
        self.__bert_config = BertConfig.from_pretrained(self.__pretained_path)
        self.__hidden_size = self.__bert.config.hidden_size
        self.__dropout = nn.Dropout(dropout)
        self.__classifier = nn.Linear(in_features=self.__hidden_size,out_features=len(self.bio_unique_labels),bias=True)
        self.__softmax = nn.Softmax
        self.__torchmax = torch.max
        return
    
    def is_predict(self,predict:bool):
        self.__predict = predict
        return
        
    def forward(self,input_ids,attention_mask):
        with torch.no_grad():
            token_series = self.__bert(input_ids,attention_mask)[0]
        token_series = self.__dropout(token_series)
        token_series = self.__classifier(token_series)
        batch_size, seq_len, ner_class_num = token_series.shape
        logits = token_series.view(
            (batch_size * seq_len, ner_class_num))
        return token_series if self.__predict else logits
      

In [None]:
model = NerModel()
model.train()

In [None]:
batch_size = 32
num_epoch = 30  
learning_rate = 1e-5  

In [None]:
optimizer = Adam(model.parameters(), learning_rate) 
CE_loss = nn.CrossEntropyLoss()

In [None]:
#load train & val data
from train import handle_raw_data
from torch.utils.data import Dataset ,DataLoader
class NerDataset(Dataset):
    def __init__(self,pd):
        super().__init__()
        self.__dataset = pd
        self.__data_size = len(pd)        
    def __getitem__(self, index:int):
        return (self.__dataset.loc[index,"W"], self.__dataset.loc[index,"B"]) 
        
    def __len__(self):
        return self.__data_size


In [None]:
unique_labels = [
    "dis",
    "sym",
    "pro",
    "equ",
    "dru",
    "ite",
    "bod",
    "dep",
    "mic",
]

bio_unique_labels = []
bio_unique_labels.extend([ "B-"+l for l in unique_labels])
bio_unique_labels.extend([ "I-"+l for l in unique_labels])
bio_unique_labels.extend([ "O-"+l for l in unique_labels])
bio_unique_labels.extend([ "E-"+l for l in unique_labels])
bio_unique_labels.extend([ "S-"+l for l in unique_labels])
bio_unique_labels.append("O")

labels_to_ids = {k: v for v, k in enumerate(sorted(bio_unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(bio_unique_labels))}
print(labels_to_ids)

In [22]:
from transformers import BertTokenizer
import numpy as np
def my_coffate_fn(batch):
    sentences, labels = [] , []
    tokenizer = BertTokenizer.from_pretrained("./premodels/roberta-large-chinese/")
    for (s , l)  in batch:
        token_series = tokenizer.encode_plus(
            list(s),
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors="pt",
            is_split_into_words=False
        )
        model_inputs = (token_series["input_ids"],token_series["attention_mask"])
        
        print(1111)
        l.insert(0,"O")
        l.extend(["O"]*(512-len(l)))
        if len(l) != 512:
            print(len(l))
            raise BaseException 
        labels_series = []
        labels_series.extend([labels_to_ids[i] for i in l ])
        if len(labels_series) != 512:
            raise BaseException 
        sentences.append(model_inputs)
        labels.append(labels_series)
    return sentences,labels
        

In [None]:
from train import handle_raw_data
train_data = handle_raw_data(raw_path="./data/dev.json",csv_path="./data/dev.csv")
train_data_dataset = NerDataset(train_data)


In [23]:
dataloader = DataLoader(
    dataset=train_data_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=my_coffate_fn
)

In [None]:
# from transformers import BertTokenizerFast
# tokenizer = BertTokenizer.from_pretrained("./premodels/bert-base-chinese/")
# text_tokenized =  tokenizer("我今天吃了饭,使用了nvidia-smi做transformer监控")
# print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"]))



In [24]:
for k1,(k2,k3 )in enumerate(dataloader):
    print("k2",k2)
    print("k3",k3)


k2 [(tensor([[ 101, 8020,  753, 8021, 4495, 7270, 1355, 5509, 7397, 4809, 4507,  754,
         3710, 1825, 7000,  696, 1927, 2471, 6629, 5852, 1075, 7397, 4809, 8024,
         2193, 5636, 4495, 7270, 6826, 5353, 1350, 3255, 5543, 7397, 4809,  511,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,   

BaseException: 