In [1]:
import pandas as pd

## 加载入tokenizer

In [2]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

transformers_path = Path("/home/xiaoguzai/.local/lib/python3.9/site-packages/transformers")

input_dir = Path("/home/xiaoguzai/程序/NBME-Score Clinical Patient Notes/代码")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

#print('convert_file = ')
#print(convert_file)
#print('conversion_path = ')
#print(conversion_path)
shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path 

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)
    
from transformers import AutoTokenizer,AutoModel,AutoConfig
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
import transformers
tokenizer = DebertaV2TokenizerFast.from_pretrained('/home/xiaoguzai/模型/deberta-v3-large')


## 加载模型

In [18]:
max_len = 354
#下面prepare_input的时候将text和feature_text合在一起很巧妙

def prepare_input(text, feature_text):
    #inputs1 = tokenizer.encode_plus(text,\
    #                               add_special_tokens=True,\
    #                               max_length = max_len,\
    #                               paddin)
    inputs = {}
    inputs1 = tokenizer.encode_plus(text,\
                                   add_special_tokens=True,\
                                   max_length = max_len,\
                                   return_offsets_mapping = False)
    inputs2 = tokenizer.encode_plus(feature_text,\
                                   add_special_tokens=True,\
                                   max_length = max_len,\
                                   return_offsets_mapping = False)  
    
    if len(inputs1['input_ids'])+len(inputs2['input_ids'])-2 > max_len:
        exceed_length = len(inputs1['input_ids'])+len(inputs2['input_ids'])-max_len-2
        inputs['input_ids'] = inputs1['input_ids'][:len(inputs1['input_ids'])-exceed_length-1]+inputs2['input_ids'][1:]
        inputs['attention_mask'] = [1]*max_len
    else:
        inputs['input_ids'] = inputs1['input_ids'][:-1]+inputs2['input_ids'][1:-1]
        inputs['input_ids'] = inputs['input_ids']+[tokenizer.sep_token_id]
        #inputs['input_ids'] = inputs['input_ids']+[tokenizer.sep_token_id]+[tokenizer.pad_token_id]*(max_len-len(inputs['input_ids'])-1)
        inputs['attention_mask'] = [1]*(len(inputs['input_ids']))
        inputs['attention_mask'] = inputs['attention_mask']+[0]*(max_len-len(inputs['input_ids']))
        inputs['input_ids'] = inputs['input_ids']+[tokenizer.pad_token_id]*(max_len-len(inputs['input_ids']))
        inputs['token_type_ids'] = [0]*max_len
        #inputs['attention_mask'] = [1]*max_len
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v,dtype=torch.long)
    return inputs,len(inputs['input_ids'])

In [4]:
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
tokenizer = DebertaV2TokenizerFast.from_pretrained('/home/xiaoguzai/模型/deberta-v3-large')

In [5]:
def change_location_to_offset(text,location_list):
    results = np.zeros(len(text))
    #以char为级别计算，应该对整个text计算len
    for idx, offset_mapping in enumerate(location_list):
        try:
            start = (int)(offset_mapping[0])
            end = (int)(offset_mapping[1])
            results[start:end] = 1
        except:
            continue
    return results

def create_label(text):
    encoded = tokenizer.encode_plus(text,\
                                add_special_tokens=True,\
                                max_length = max_len,\
                                padding = "max_length",\
                                return_offsets_mapping = True)
    offset_mapping = encoded['offset_mapping']
    return offset_mapping


In [6]:
from torch.utils.data import DataLoader,Dataset
import itertools
class TestDataset(Dataset):
    def __init__(self,ids,text,input_ids,offset,token_type_ids,attention_mask):
        self.input_ids = input_ids
        self.tensors = [ids,\
                        text,\
                        torch.tensor(input_ids,dtype=torch.long),
                        torch.tensor(offset),\
                        torch.tensor(token_type_ids,dtype=torch.long),\
                        torch.tensor(attention_mask,dtype=torch.long)]
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self,index):
        return tuple(tensor[index] for tensor in self.tensors)

In [7]:

import torch
import torch.nn as nn
import math
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class ClassificationModel(nn.Module):
    def __init__(self,model):
        super(ClassificationModel,self).__init__()
        self.model = model
        #self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(1024,1)
        
    def forward(self,input_ids,token_type_ids,attention_mask):
        outputs = self.model(input_ids=input_ids,\
                           token_type_ids=token_type_ids,\
                           attention_mask=attention_mask)
        outputs = outputs[0]
        #outputs = self.dropout(outputs)
        outputs = self.fc1(outputs)
        return outputs



In [8]:


def get_char_probs(total_text,offsets,predictions):
    results = [np.zeros(len(t)) for t in total_text]
    #!!!results 长短不一!!!
    #以char为级别计算，应该对整个text计算len
    torch.set_printoptions(threshold=np.inf)
    for i, (offset, prediction) in enumerate(zip(offsets, predictions)):
        for idx, (offset_mapping, pred) in enumerate(zip(offset, prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            #results[i][start:end] = ((float)(pred[0].item(),)
            results[i][start:end] = pred[0].item()
    return results

def get_results(test_text,char_probs, th=0.5):
    results = []
    #for char_prob in char_probs:
    for index in range(len(char_probs)):
        char_prob = char_probs[index]
        char_text = test_text[index]
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        #result = [f"{min(r)} {max(r)}" for r in result]
        result = [[min(r),max(r)] for r in result]
        
        for index1 in range(len(result)):
            if result[index1][0]-1 >= 0 and char_text[result[index1][0]-1] != ' ':
                result[index1][0] = result[index1][0]-1
                #preds[index][index1][0] = preds[index][index1][0]-1
            #if preds[index][index1][1]+1 < len(current_text) and current_text[preds[index][index1][1]+1] != ' ':
            #    preds[index][index1][1] = preds[index][index1][1]+1
        
        result = [str(r[0])+' '+str(r[1]) for r in result]
        result = ";".join(result)
        results.append(result)
    return results

def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions



In [9]:
model = torch.load('/home/xiaoguzai/程序/NBME-Score Clinical Patient Notes/deberta-v3模型文件/deberta_capatalize_noid_best_point=0.887656_fold=0.pth')

## 按照30%的比例增加数据

In [10]:
train = pd.read_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/train.csv')
features = pd.read_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/features.csv')
patient_notes = pd.read_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/patient_notes.csv')
train = train.merge(features, on=['feature_num','case_num'],how='left')
train = train.merge(patient_notes, on=['pn_num','case_num'],how='left')
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],['203 217'],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [11]:
features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [12]:
values = train.pn_num.values

## 增加0标签部分内容
id:00016~02436
長度:0～1301
(1301-0)/12 = 108
108/3 = 36
间隔id:2436/36=68

In [13]:
features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [14]:
patient_notes.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [21]:
#pseudo_labeling = pd.read_csv('Pseudo-Labelling.csv')
import numpy as np
pseudo_labeling = pd.DataFrame()
#add 30 every time
for index in range(36):
    #总的次数
    data1 = index*68
    
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        print('========s = ========')
        print(s)
        print('====================')
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                new_data1 = str(data1)
                r1 = new_data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

{'id': '00001_000', 'case_num': 0, 'pn_num': 1, 'feature_num': 0}
{'id': '00069_000', 'case_num': 0, 'pn_num': 69, 'feature_num': 0}
{'id': '00137_000', 'case_num': 0, 'pn_num': 137, 'feature_num': 0}
{'id': '00205_000', 'case_num': 0, 'pn_num': 205, 'feature_num': 0}
{'id': '00273_000', 'case_num': 0, 'pn_num': 273, 'feature_num': 0}
{'id': '00341_000', 'case_num': 0, 'pn_num': 341, 'feature_num': 0}
{'id': '00409_000', 'case_num': 0, 'pn_num': 409, 'feature_num': 0}
{'id': '00477_000', 'case_num': 0, 'pn_num': 477, 'feature_num': 0}
{'id': '00478_000', 'case_num': 0, 'pn_num': 478, 'feature_num': 0}
{'id': '00545_000', 'case_num': 0, 'pn_num': 545, 'feature_num': 0}
{'id': '00613_000', 'case_num': 0, 'pn_num': 613, 'feature_num': 0}
{'id': '00681_000', 'case_num': 0, 'pn_num': 681, 'feature_num': 0}
{'id': '00749_000', 'case_num': 0, 'pn_num': 749, 'feature_num': 0}
{'id': '00818_000', 'case_num': 0, 'pn_num': 818, 'feature_num': 0}
{'id': '00819_000', 'case_num': 0, 'pn_num': 819, '

In [22]:
prin

NameError: name 'prin' is not defined

In [None]:
pseudo_labeling

## 增加1标签部分内容
id:10004~10988
长度:1302～2601
总的次数(2601-1302)/36=36
间隔id:(10988-10004)/36=944/36=26

In [24]:
total_index = 36
index = 1
#for index in range(36):
#for data1 in range(95217):
data1 = 0
for index in range(36):
    data1 = 10030+index*26
        
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        print('======s = ======')
        print(s)
        print('================')
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            print('ids = ')
            print(ids)
            print('text = ')
            print(text)
            print('feature_text = ')
            print(feature_text)
            inputs,length = prepare_input(text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(100,113):
            #每次增加的id
                new_data1 = str(data1)
                r1 = new_data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":1,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

{'id': '10031_000', 'case_num': 1, 'pn_num': 10031, 'feature_num': 0}
ids = 
10031_000
text = 
HPI: 20 yo F c/o suddent RLQ abdominal pain, dull crampy pain, PS 5/10, took ibuprofen but it didn't help, worsened when walking around. she has been having diarrhea for a couple of days: watery, brown stool. He appetite decreased. No fever, no urinary/bowel movement problems, no falls. No abnormal vaginal discharge/bleeding. 

ROS: negative exxept above
PMH: none
PSH:none
meds: ibuprofen
allergies: NKDA
menstruation: LMP last 2 weeks, regular every month, no change in flow
SH: no smoking, no EtOH, no recreational drug use, last SI 9 months ago with her boyfriend; use condom everytime, no STD history
feature_text = 
nan


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

## 增加2标签部分内容(20001～22100)
id:20001~22123
長度2602～4301
总的次数:(4301-2602)/36=47
间隔id:(22123-20001)/47=45

In [None]:
for index in range(47):
    data1 = 20010+index*45
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签3部分内容
id:30037~39921
長度4302~5901
总的次数(5901-4302)/36=44
间隔id(39921-30037)/44 = 224

In [None]:
for index in range(30):
    data1 = 30300+index*224
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签4部分内容
id:40045~45947
長度5902~6901
总的次数:(6901-5902)/36=28
间隔id:(45947-40045)/28=211

In [None]:
for index in range(28):
    data1 = 40080+index*211
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签5部分内容
id:50072~57026
長度6902～8701
总的次数:(8702-6902)/36=50
间隔id:(57026-50072)/50 = 140

In [None]:
for index in range(50):
    data1 = 50140+index*140
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签6部分内容
id:60004~61768
长度:8702~9901
总的次数:(9901-8702)/36=33
间隔id:(61768-60004)/33=53

In [None]:
for index in range(33):
    data1 = 60053+index*53
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签7部分内容
id:70087~74087
长度9902~10801
总的次数:(10801-9902)/36=25
间隔id:(74087-70087)/25=160

In [None]:
for index in range(25):
    data1 = 70160+index*160
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签8部分内容
id:80039~84366
长度:10802~12601
总的次数:(12601-10802)/36=50
间隔id:(84366-80039)/50=87

In [None]:
for index in range(50):
    data1 = 80087+index*87
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

## 增加标签9部分内容
id:90127~95333
长度:12602~14301
总的次数(14301-12602)/36=47
间隔id(95333-90127)/47=111

In [None]:
for index in range(47):
    data1 = 90111+index*111
    #******判斷是否能預測出內容
    while 1:
        flag = False
        final_result = []
        data1 = data1+1
        while data1 in values:
            data1 = data1+1
        #for index1 in range(13):
        #每次增加的id
        index1 = 0
        new_data1 = str(data1)
        r1 = new_data1.rjust(5,'0')
        new_data2 = str(index1)
        r2 = new_data2.rjust(3,'0')
        s = {"id":r1+'_'+r2,\
             "case_num":0,\
             "pn_num":int(r1),\
             "feature_num":int(r2)}
        #只判断0位置
        current_data = pd.DataFrame(s,index=[0])
        current_data = current_data.merge(features,on=['feature_num','case_num'],how='left')
        current_data = current_data.merge(patient_notes,on=['pn_num','case_num'],how='left')
        test_ids = []
        test_text,test_input_ids,test_token_type_ids = [],[],[]
        test_attention_mask,test_offset,test_label = [],[],[]
        for  index,data  in  current_data.iterrows():
            #!!!数据这里出现bug，读取的都是一组数据!!!
            ids = data['id']
            text = data['pn_history']
            feature_text = data['feature_text']
            if type(text) != str and math.isnan(text):
                break
            inputs,length = prepare_input(ids,text,feature_text)
            current_offset = create_label(text)
            test_ids.append(ids)
            test_text.append(text)
            test_input_ids.append(inputs['input_ids'].tolist())
            test_token_type_ids.append(inputs['token_type_ids'].tolist())
            test_attention_mask.append(inputs['attention_mask'].tolist())
            test_offset.append(current_offset)
        
        if len(test_text) == 0:
            continue

        test_dataset = TestDataset(test_ids,\
                   test_text,\
                   test_input_ids,\
                   test_offset,\
                   test_token_type_ids,\
                   test_attention_mask)
        test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False)

        current_result = []
        for batch_ids,batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in test_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            with torch.no_grad():
                logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                              attention_mask=batch_attention_mask)
                logit = torch.sigmoid(logit)
                preds = get_char_probs(batch_text,batch_offset,\
                                       logit.cpu())
                for data in preds:
                    current_result.append(data)
                current_result = get_results(batch_text,current_result)
        final_result.append(current_result)
        #results = get_results(test_text,final_result)
        for data in final_result:
            if len(data) != 0:
                flag = True
        if flag == True:
            for index1 in range(13):
            #每次增加的id
                data1 = str(data1)
                r1 = data1.rjust(5,'0')
                data2 = str(index1)
                r2 = data2.rjust(3,'0')
                s = {"id":r1+'_'+r2,\
                     "case_num":0,\
                     "pn_num":int(r1),\
                     "feature_num":int(r2)}
                pseudo_labeling = pseudo_labeling.append(s,ignore_index=True)
            break

In [None]:
pseudo_labeling.head()

In [None]:
pseudo_labeling.to_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/pseudo_labeling.csv',index=False)