# Named Entity Recognition with BERT in PyTorch
Based: https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a  
library: Hugging face's "Transformer"   
数据库存在脏数据！label和词长度不一致  
We use "BertFortokenClassification" instead of "BertForSequenceClassification"

In [16]:
# Mode chosing
Colab=False
Remote_server=True

In [17]:
#######For google
if Colab:
    !pip install transformers
if Remote_server:
    Work_path='/workspace/Bert_Chinese/'
else:
    Work_path='./'

In [18]:
import pandas as pd
from transformers import BertTokenizerFast
import torch
import numpy as np
from transformers import BertForTokenClassification
from torch.utils.data import DataLoader    
#if we use import torch.utils.data.DataLoader as Dataloader, here dataloader is a module
#but here, from torch,utils.data import dataloader is now a function
import torch.optim as optim   #Here optim is still a module, we always use optim.SGD to create a function SGD
from tqdm import tqdm   #same reason, if we "import tqdm" directly, will be error: module is not callable
from torch.utils.tensorboard import SummaryWriter
from torchcrf import CRF
import copy

In [19]:
########Load from gdrive
if Colab:
    from google.colab import drive
    drive.mount('/content/drive')
    df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Input/People_dalily_10000_examples_adapted.csv")


####load(local) 
else:
    #df=pd.read_csv('/workspace/Bert_Chinese/People_dalily_10000_examples_adapted.csv')
    df=pd.read_csv(Work_path+'People_dalily_10000_examples_adapted.csv')

In [20]:
#df=pd.read_csv('ner.csv')   #Not completed, like "Demonstrators" not regard as ent
df.head(5)

Unnamed: 0,text,labels
0,"人 民 网 1 月 1 日 讯 据 《 纽 约 时 报 》 报 道 , 美 国 华 尔 街 ...",O O O B_T I_T I_T I_T O O O B_LOC I_LOC O O O ...
1,"《 纽 约 时 报 》 报 道 说 , 标 普 5 0 0 指 数 今 年 上 升 2 9 ...",O B_LOC I_LOC O O O O O O O O O O O O O O B_T ...
2,"就 1 2 月 3 1 日 来 说 , 由 于 就 业 前 景 看 好 和 经 济 增 长 ...",O B_T I_T I_T I_T I_T I_T O O O O O O O O O O ...
3,"另 据 《 华 尔 街 日 报 》 报 道 , 2 0 1 3 年 是 1 9 9 5 年 ...",O O O B_LOC I_LOC I_LOC O O O O O O B_T I_T I_...
4,人 民 网 平 壤 1 月 1 日 电 ( 记 者 王 莉 、 程 维 丹 ) 朝 鲜 最 ...,O O O B_LOC I_LOC I_T I_T I_T I_T O O O O B_PE...


## Give ids to labels

In [21]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in df['labels'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]   #here .add is fun without return value, thus if we set a=[...], 
                                                                  # a will =[]

# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
Max_len=512
print(ids_to_labels)

{0: 'B_LOC', 1: 'B_ORG', 2: 'B_PER', 3: 'B_T', 4: 'I_LOC', 5: 'I_ORG', 6: 'I_PER', 7: 'I_T', 8: 'O'}


## Tokenize(Output directly become tensor)
Bert tokenizer can transform a sentence in nl to a list of number

In [22]:
# Let's take a look at how can we preprocess the text - Take first example
text = df['text'].values.tolist()
example = text[36]   #get one sentence (THE 36th sentence)

#tokenize by bert
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
text_tokenized = tokenizer(example, padding='max_length', max_length=Max_len, truncation=True, return_tensors="pt")

#3 layer can be found: input_ids, token_type_ids,atention_mask
#print(text_tokenized)

### decode

In [23]:
#print(tokenizer.decode(text_tokenized.input_ids[0][0:50]))    #0 is the first sentence,since here we have only one sentence to tokenize
#we can't decode more than 1 sentence with .decode

### "Convert ids to token" will find Bert's"subword" problem
The BERT tokenizer uses the so-called word-piece tokenizer under the hood, which is a sub-word tokenizer. This means that BERT tokenizer will likely to **split one word into one or more meaningful sub-words**.
*Which make label provided by database can't match tokens*

In [24]:
######Check output of word_ids, we will find it pretty smart:
######First token [CLS] is marked as None, which avoid the movement of whole sentence
######All words are marked with their real "index" in "labels list"(0st word is Prime, and 3rd word is G+ei+r)
word_ids = text_tokenized.word_ids()
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0])[0:50])
print(word_ids[0:50])  

['[CLS]', '新', '年', '前', '夕', ',', '国', '家', '主', '席', '习', '近', '平', '通', '过', '中', '国', '国', '际', '广', '播', '电', '台', '、', '中', '央', '人', '民', '广', '播', '电', '台', '、', '中', '央', '电', '视', '台', ',', '发', '表', '了', '2', '0', '1', '4', '年', '新', '年', '贺']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]


### Two method can be applied to solve this problem:
These word_ids will be very useful to adjust the length of the label by applying either of these two methods:  

1, We only provide a label to the first sub-word of each splitted token. The continuation of the sub-word then will simply have ‘-100’ as a label. All tokens that don’t have word_ids will also be labeled with ‘-100’.  
2, We provide the same label among all of the sub-words that belong to the same token. All tokens that don’t have word_ids will be labeled with ‘-100’.

In [25]:
def align_label_example(tokenized_input, labels,labels_to_ids,label_all_tokens):
    '''
    output: labels_ids
    
    '''
    word_ids = tokenized_input.word_ids()

    previous_word_idx = None
    label_ids = []
    #print('word')
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
            
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])  #As we mention, word_idx is exactly 
            except:
                label_ids.append(-100)   #此处是避免脏数据的影响（即label的长度和实际的句子长度不同）
    
        else:
            # print('label_id', label_ids)
            # print('word_ids ',word_ids[0:50])
            # #print('label[word]', labels[word_idx])
            # print('label list ', len(labels))
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)


        previous_word_idx = word_idx
    

    return label_ids

## New lables for bert to train
Since original labels can't match with its token list, we create a new lables list to fit it  
what is more, we present labels in ids form

In [26]:
label = labels[36]

#If we set label_all_tokens to True.....
label_all_tokens = True

new_label = align_label_example(text_tokenized, label,labels_to_ids,label_all_tokens)
print(new_label[0:25])   #he
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0])[0:25])

[-100, 3, 7, 8, 8, 8, 8, 8, 8, 8, 2, 6, 6, 8, 8, 0, 4, 8, 8, 8, 8, 8, 8, 8, 8]
['[CLS]', '新', '年', '前', '夕', ',', '国', '家', '主', '席', '习', '近', '平', '通', '过', '中', '国', '国', '际', '广', '播', '电', '台', '、', '中']


## Dataset Class(tokenize include)
Before we train our BERT model for NER task, we need to create a dataset class to generate and fetch data in a batch.

In [27]:
##### NOt a simple class of dataset, we also realise tokenizer here

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df,labels_to_ids,label_all_tokens):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()   #list of sentence
        
        text_tokenized = [tokenizer(str(i),
                               padding='max_length', max_length = Max_len, truncation=True, return_tensors="pt") for i in txt]
        
        self.texts=text_tokenized
         
        self.labels = [align_label_example(i,j,labels_to_ids,label_all_tokens) for i,j in zip(text_tokenized, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels   

### Train val split

In [37]:
df = df[0:1000]    #we pick only 1000 example
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),    
                            [int(.8 * len(df)), int(.9 * len(df))])    #split the lisy into 3 parts, with 2 cut
                                                                        #one cut at 0.8*len(df)
                                                                        #one cut at 0.9*len(df)
                                                                        #so train:val:test=8:1:1

#### Test Datasequence (check with class "Datasequence"'s return)
Find that output of DataSequence is a list with len of nb_sentence,  and each elements is a tuple  

elem 1 in truple : Dict with 3 pairs key-value: input_ids, attention mask, token_type_ids   
elem 2 in truple: labels's ids

In [38]:
# Data_token=DataSequence(df_train[43:100],labels_to_ids,label_all_tokens)
# Data_token.__getitem__(20)[1]

### Torch's model definition
Define a model class in torch's way

In [39]:
len(unique_labels)

9

In [40]:
class BertModel(torch.nn.Module):

    def __init__(self,crf=False,Unknow_label=len(unique_labels)-1,device_used='cuda'):       #here we set defaut value for these function,
                                            #and we can change them if we want, just by calling: model.crf
                                            #Now you know, init means initialize!!, we do it only when we call model=BertModel

        super(BertModel, self).__init__()   #for pytorch, this lign is obligatory

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))  #transformer layer
        self.crf_layer = CRF(num_tags=len(unique_labels),batch_first=True)    #here since our tensor is batch_size*sqe_len, so here is batch_first
        self.crf=crf
        self.unknow_label=Unknow_label
        self.device_used=device_used
        self.max_len=Max_len
        
    
    def forward(self, input_id, mask,label):   #Forward is a special function, we can pass like model(input), and we get the return
        if not self.crf:
            output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)  
                                                                                        #self.bert equal to model
                                                                                        #return_dict=false -> return value is a tuple of (loss, logits)
        else:
            loss,logits=self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False) 
            
            ###adaptation to fit crf
            mask_adapted=self.mask_adapted_first_end(mask)
            label_adapted=self.filter_label(label)
            logits_adapted=self.filter_logits_first(logits)
            
            ### 
            loss=self.crf_layer(logits_adapted,label_adapted,mask_adapted)
            pre=self.crf_layer.decode(logits_adapted)
            output=(loss,pre)
        
        return output
    
    def mask_adapted_first_end(self,mask):
        """
        To adapt the output of bert to input of crf:

        Here we delete the last two '1' in the end
        1 is for the disappear of first first token 
        and 1 is for imforming the ML that the last one is not important
        """
        mask_squ=copy.deepcopy(mask)
        label_batch_filted=[]
        for label_win in mask_squ:
            #print('before',(label_win!=0).sum())
            len_one=(label_win!=0).sum()
            label_win[len_one-2:len_one]=0
            #print('after',(label_win!=0).sum())
            label_batch_filted.append(label_win)
            mask_filted=torch.stack(label_batch_filted)
        return mask_filted[:,1:self.max_len].type(torch.bool)
    
    def filter_label(self,label):
        """
        To adapt the output of bert to input of crf:
        1, cut off the first label of each sentence, which is always -100
        2, replace all -100 by 8, which will be ignored thanks to mask, (we have to do it or else the program cann't go on)
        """
        Unknow_label=self.unknow_label
        device=self.device_used
        delete_first_col_label=label[:,1:self.max_len]
        label_change_value=torch.where(delete_first_col_label==-100,torch.tensor(Unknow_label).to(device),delete_first_col_label)  
        return label_change_value
    
    def filter_logits_first(self,logits):
        """
        To adapt the output of bert to input of crf:
        1, We delete the prediction of first token in each sentence,since we do so in filter_label
        """
        #return torch.index_select(logits,1,(1+torch.IntTensor(range(511))).long().to(self.device_used))
        return logits[:,1:self.max_len]

### About the warning of BertForTokenClassifica
You may occur such warning:   
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification:....   
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).   
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).   
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly  initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.   

Answer:  
https://github.com/huggingface/transformers/issues/5421#issuecomment-652582854  

@ohmeow you're loading the **bert-base-cased checkpoint** (which is a checkpoint that was trained using a similar architecture to BertForPreTraining) in a BertForSequenceClassification model.

This means that:

The layers that BertForPreTraining has, but BertForSequenceClassification does not have will be discarded  
The layers that BertForSequenceClassification has but BertForPreTraining does not have will be randomly initialized.  
This is expected, and tells you that you won't have good performance with your BertForSequenceClassification model before you   fine-tune it 🙂.

In [41]:
def acc_calculation_crf(pre,train_label,model):
    unknow_label=model.unknow_label
    device=model.device_used
    pre=torch.tensor(pre).to(device)

    acc_batch=[]
    nb_valid_label=0

    train_label=train_label[:,1:model.max_len]
    pre_clean = [pre[i,(train_label[i]!=-100)&(train_label[i]!=unknow_label)] for i in range(len(train_label))]
    label_clean = [train_label[k,(train_label[k]!=-100)&(train_label[k]!=unknow_label)] for k in range(len(train_label))]

    for i in range(len(pre_clean)):
        if pre_clean[i].shape[0]!=0:
            acc_batch.append((pre_clean[i]==label_clean[i]).float().mean())
            nb_valid_label+=1
    acc=torch.tensor(acc_batch).sum()
    return acc,nb_valid_label

In [42]:
def acc_calculation(logits,train_label,unknow_label):
    acc_batch=[]
    nb_valid_label=0
    logits_clean = [logits[i,(train_label[i]!=-100)&(train_label[i]!=unknow_label)] for i in range(len(train_label))]   #size_batch*len_sentence_without_-100
    label_clean = [train_label[k,(train_label[k]!=-100)&(train_label[k]!=unknow_label)] for k in range(len(train_label))]   #size_batch     

    ###calculate prediction and accuracy
    prediction=[]
    for i in range(len(logits_clean)):
        if logits_clean[i].shape[0]!=0:
            prediction.append(logits_clean[i].argmax(dim=1))
        else:
            prediction.append(torch.tensor([]))
    for i in range(len(logits_clean)):
        if prediction[i].shape[0]!=0:
            acc_batch.append((prediction[i]==label_clean[i]).float().mean())
            nb_valid_label+=1
    acc=torch.tensor(acc_batch).sum()
    return acc,nb_valid_label


### Training

In [43]:
def train_loop(model, df_train, df_val,optimizer,EPOCHS,accumulation_steps,permit_decrease,Unknow_label,writer,writer_epochs):

    #Dataloading
    train_dataset = DataSequence(df_train,labels_to_ids,label_all_tokens) #output a tuple: (dict of input, list of label)
    val_dataset = DataSequence(df_val,labels_to_ids,label_all_tokens)
    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)   #all in one batch
    val_dataloader = DataLoader(val_dataset, batch_size=4)
    
    
    #GPU / CPU
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    #device='cpu'
    
    #Presetiing
    model=model.to(device)
    model.device_used=device
    best_val_acc=0
    nb_decreasing_acc=0
    
    list_loss_train=[]
    list_acc_val=[]

    for epoch_num in range(EPOCHS):
        total_nb_valide_example_train=0
        total_nb_valide_example_val=0
        total_acc_train = 0
        total_loss_train = 0
        step=0

        ######Training######
        model.train()

        for train_data, train_label in tqdm(train_dataloader): 
            step=step+1
            train_label = train_label.to(device)   #get label
            mask = train_data['attention_mask'].to(device)   #get attention mask
            input_id = train_data['input_ids'].to(device)   
            
            #####Sqe useless dim
            input_id_sqe=torch.squeeze(input_id)
            mask_sqe=torch.squeeze(mask)

            #####Forward
            if model.crf:
                loss,pre=model(input_id_sqe, mask_sqe, train_label)
                acc,nb_valid_example=acc_calculation_crf(pre,train_label,model)
            else:
                loss, logits = model(input_id_sqe, mask_sqe, train_label)   #3 input to model(see class BertModel for details)
                                                                    #loss is obvious the loss function
                                                                    #logit is the "raw output" of the model(quite nornal in classification model)
                                                                    #0<logits<1, for multi-classification task, it offen pass through a softmax,
                                                                    #then we get probability of  each class
                acc,nb_valid_example=acc_calculation(logits,train_label,unknow_label=Unknow_label)
            
            ####For calculate acc, should clean off meanless data: -100
            total_nb_valide_example_train+=nb_valid_example

            total_acc_train += acc
            total_loss_train += loss.item()

            #####With accu to save GPU
            
            loss = loss/accumulation_steps
            loss.backward()

            if((step+1)%accumulation_steps)==0:
              optimizer.step()        # 反向传播，更新网络参数
              optimizer.zero_grad()   # 清空梯度
        if(step==1):
          break
            

        ########evaluation######

        model.eval()
        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in tqdm(val_dataloader):
            val_label = val_label.to(device)
            mask = val_data['attention_mask'].to(device)
            input_id = val_data['input_ids'].to(device)
            
            #####Sqe useless dim
            input_id_sqe=torch.squeeze(input_id)
            mask_sqe=torch.squeeze(mask)

            ##Forward
            loss, logits = model(input_id_sqe, mask_sqe, train_label)

            acc,nb_valid_example=acc_calculation(logits,val_label,Unknow_label)
            total_nb_valide_example_val+=nb_valid_example

            total_acc_val += acc
            total_loss_val += loss.item()

        #####Early stop#####
        if best_val_acc<(total_acc_val / step):
            best_val_acc=total_acc_val / step
            nb_decreasing_acc=0
            torch.save(model.state_dict(), Work_path+'Model_backup/Model_backup.pt')
        else:
          nb_decreasing_acc+=1
        if nb_decreasing_acc==permit_decrease:
          print('\n\n Overall fitting avoiding! ')
          break

        
        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / total_nb_valide_example_train: .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / total_nb_valide_example_val: .3f}')
#         writer.add_scalar("loss_train", total_loss_train / len(df_train), epoch_num+writer_epochs)
#         writer.add_scalar("acc_train", total_acc_train / total_nb_valide_example_train, epoch_num+writer_epochs)
#         writer.add_scalar("loss_val", total_loss_val / len(df_val), epoch_num+writer_epochs)
#         writer.add_scalar("acc_val",total_acc_val / total_nb_valide_example_val,epoch_num+writer_epochs)
        
    return epoch_num 

#### Start training

In [44]:
#####Model path
if Colab:
  model_save_name = 'Bert_chinese_formal.pt'
  Path=F"/content/drive/MyDrive/Colab Notebooks/Output/{model_save_name}"
else:
  Path=Work_path+'Model_backup/Model_backup.pt'

####################################
#####Setting before training######
####################################
Start_new_training=True
Load_from_driver=False
Keep_training=True

#####Model loading
if Start_new_training:
    torch.cuda.empty_cache()
    model=BertModel()
    writer=SummaryWriter(log_dir=Work_path+'runs')
    writer_epochs=0
    


if Load_from_driver:
    torch.cuda.empty_cache()
    model=BertModel()
    model.load_state_dict(torch.load(Path))


##########################################
#################Dataloading##############
##########################################
df = df[0:1000]    #we pick only 1000 example
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8 * len(df)), int(.9 * len(df))])   



##############Parameter setting##########
LEARNING_RATE = 0.5e-2   
EPOCHS = 3
accumulation_steps=8    #with accumutlation_step bigger than 1, we can save the usage of GPU storage
LEARNING_RATE=LEARNING_RATE*accumulation_steps  #Lr should be increased, or else the training will be too slow
permit_decrease=5
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
Unknow_label=8
model.crf=True
#############Start training###########
if Keep_training:
    epochs_trained=train_loop(model, df_train, df_val,optimizer=optimizer,EPOCHS=EPOCHS,accumulation_steps=accumulation_steps,permit_decrease=permit_decrease,Unknow_label=Unknow_label,writer=writer,writer_epochs=writer_epochs)
    writer_epochs+=epochs_trained

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

KeyboardInterrupt: 

## Model saving

In [24]:
####Model sava to input 
#torch.save(model.state_dict(), Path)
#writer.close()

### Evaluate

In [35]:
def evaluate(model, df_test):
    test_dataset = DataSequence(df_test,labels_to_ids,label_all_tokens)
    test_dataloader = DataLoader(test_dataset, batch_size=4)

    #for local or cloud
    use_cuda = True
    device = torch.device("cuda" if use_cuda else "cpu")
    
    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0
    total_nb_valid_example=0

    for test_data, test_label in test_dataloader:
        test_label = test_label.to(device)
        mask = test_data['attention_mask'].to(device)
        input_id = test_data['input_ids'].to(device)
        
        input_id_sqe=torch.squeeze(input_id)
        mask_sqe=torch.squeeze(mask)
        
        loss, logits = model(input_id_sqe, mask_sqe, test_label)
        
        acc,nb_valid_example=acc_calculation(logits,test_label,Unknow_label)
              
        total_acc_test += acc
        total_nb_valid_example+=nb_valid_example

    val_accuracy = total_acc_test / total_nb_valid_example
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate(model, df_test)

Test Accuracy:  0.119


In [92]:
device='cpu'

test_dataset = DataSequence(df_test,labels_to_ids,label_all_tokens)
test_dataloader = DataLoader(test_dataset, batch_size=4)


test_data, test_label=next(iter(test_dataloader))

mask = test_data['attention_mask'].to(device)
input_id = test_data['input_ids'].to(device)
        
input_id_sqe=torch.squeeze(input_id)
mask_sqe=torch.squeeze(mask)

loss, logits = model(input_id_sqe, mask_sqe, test_label)

In [51]:
Crf_layer

CRF(num_tags=10)

In [221]:
logits.shape

torch.Size([4, 512, 9])

In [29]:
# seq_length = 3  # maximum sequence length in a batch
# batch_size = 2  # number of samples in the batch
# emissions = torch.randn(seq_length, batch_size, num_tags)
# tags = torch.tensor([[0, 1], [2, 4], [3, 1]], dtype=torch.long)  # (seq_length, batch_size)

# mask = torch.tensor([[1, 0], [0, 1], [0, 0]], dtype=torch.uint8)
# model_crf(emissions, tags,mask)



In [87]:
def mask_adapted_first_end(mask):
    """
    To adapt the output of bert to input of crf:

    Here we delete the last two '1' in the end
    1 is for the disappear of first first token 
    and 1 is for imforming the ML that the last one is not important
     """
    mask_squ=copy.deepcopy(mask)
    label_batch_filted=[]
    for label_win in mask_squ:
        #print('before',(label_win!=0).sum())
        label_win[((label_win!=0).sum()-1)]=0
        label_win[((label_win!=0).sum()-2)]=0
        #print('after',(label_win!=0).sum())
        label_batch_filted.append(label_win)
        mask_filted=torch.stack(label_batch_filted)
        
    return torch.index_select(mask_filted,1,torch.IntTensor(range(511))).type(torch.bool)

mask_adapted_first_end(mask_sqe).shape

torch.Size([4, 511])

In [90]:
def filter_label(test_label):
    """
    To adapt the output of bert to input of crf:
    1, cut off the first label of each sentence, which is always -100
    2, replace all -100 by 8, which will be ignored thanks to mask, (we have to do it or else the program cann't go on)
    """
    delete_first_col_label=torch.index_select(test_label,1,1+torch.IntTensor(range(511)))    
    label_change_value=torch.where(delete_first_col_label==-100,torch.tensor(Unknow_label),delete_first_col_label)  
    return label_change_value

In [94]:
def filter_logits_first(logits):
    """
    To adapt the output of bert to input of crf:
    1, We delete the prediction of first token in each sentence,since we do so in filter_label
    """
    return torch.index_select(logits,1,1+torch.IntTensor(range(511)))

In [95]:
filter_label(test_label)

tensor([[8, 8, 8,  ..., 8, 8, 8],
        [8, 8, 8,  ..., 8, 8, 8],
        [2, 6, 6,  ..., 8, 8, 8],
        [8, 8, 8,  ..., 8, 8, 8]])

In [96]:
mask_adapted_first_end(mask_sqe)

tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]])

### 对mask不抱幻想，直接处理和label,把-100改成10

In [97]:
Crf_layer(filter_logits_first(logits),filter_label(test_label),mask_adapted_first_end(mask_sqe))

tensor(-71.0673, grad_fn=<SumBackward0>)

In [64]:
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=Max_len, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = Max_len, truncation=True, return_tensors="pt")

    mask = text['attention_mask'][0].unsqueeze(0).to(device)

    input_id = text['input_ids'][0].unsqueeze(0).to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)
            
evaluate_one_text(model, '张三已经到辽宁沈阳了嗷，李卡指定在九月五日没有好果汁吃')

张三已经到辽宁沈阳了嗷，李卡指定在九月五日没有好果汁吃
['B_PER', 'I_PER', 'O', 'O', 'O', 'B_LOC', 'I_LOC', 'I_LOC', 'I_LOC', 'O', 'O', 'O', 'B_PER', 'I_PER', 'O', 'O', 'O', 'B_T', 'I_T', 'I_T', 'I_T', 'O', 'O', 'O', 'O', 'O', 'O']


In [65]:
evaluate_one_text(model, '杀马特团长，我到沈阳了，你和你徒弟呢')

杀马特团长，我到沈阳了，你和你徒弟呢
['O', 'I_PER', 'I_PER', 'O', 'O', 'O', 'O', 'O', 'B_LOC', 'I_LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
