In [1]:
import torch
import numpy as np
import pandas as pd
import os
import transformers
import numpy as np
import random
import ast
import torch.nn as nn

In [2]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
# NOTE(review): `transformers` is already imported in the first cell of this
# notebook, so on a fresh kernel the package may be loaded before these files
# are replaced -- confirm the intended cell order.
import shutil
from pathlib import Path

# Installed `transformers` package directory that gets patched in place.
transformers_path = Path("/home/xiaoguzai/.local/lib/python3.9/site-packages/transformers")

# Directory holding the replacement source files that are copied into the package.
input_dir = Path("/home/xiaoguzai/程序/NBME-Score Clinical Patient Notes/代码")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

# Delete any existing copy first so the copy below always installs the local version.
if conversion_path.exists():
    conversion_path.unlink()

#print('convert_file = ')
#print(convert_file)
#print('conversion_path = ')
#print(conversion_path)
shutil.copy(convert_file, transformers_path)
# NOTE(review): the tokenization files below are written to the package root,
# while the import further down reads from `transformers.models.deberta_v2` --
# confirm the copied files are the ones that actually get imported.
deberta_v2_path = transformers_path 

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    
    # Same replace-in-place pattern as above: remove the old file, then copy the new one.
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)
    
from transformers import AutoTokenizer,AutoModel,AutoConfig
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
import transformers
# Module-level tokenizer used by prepare_input() and create_label() in later cells.
tokenizer = DebertaV2TokenizerFast.from_pretrained('/home/xiaoguzai/模型/deberta-v3-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
max_len = 360
# Merging `text` and `feature_text` into a single sequence inside prepare_input
# (below) is a neat trick.

def prepare_input(ids, text, feature_text):
    """Build fixed-length model inputs for one (patient note, feature) pair.

    ``text`` and ``feature_text`` are encoded separately with the module-level
    ``tokenizer`` and then spliced into one sequence: the last token of the
    text encoding and the first token of the feature encoding (the special
    tokens added by ``add_special_tokens=True``) are dropped, so the feature
    tokens directly follow the text tokens and the sequence ends with a single
    separator token.  When the spliced sequence would be longer than
    ``max_len``, tokens are removed from the end of the text part; the feature
    part is always kept whole.

    Args:
        ids: Row identifier. Accepted for call-site compatibility; not used.
        text: Patient-note text.
        feature_text: Feature description appended after the note.

    Returns:
        A tuple ``(inputs, length)``.  ``inputs`` is a dict with the keys
        ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``, each a
        1-D ``torch.long`` tensor of ``max_len`` elements.  ``length`` is
        ``len(inputs['input_ids'])``.
    """
    # `truncation=True` makes explicit what the tokenizer otherwise falls back
    # to (with a warning) when only `max_length` is given.
    encode_options = dict(add_special_tokens=True,
                          max_length=max_len,
                          truncation=True,
                          return_offsets_mapping=False)
    text_ids = tokenizer.encode_plus(text, **encode_options)['input_ids']
    feature_ids = tokenizer.encode_plus(feature_text, **encode_options)['input_ids']

    # The "- 2" accounts for the two special tokens dropped when splicing.
    exceed_length = len(text_ids) + len(feature_ids) - 2 - max_len
    if exceed_length > 0:
        # Too long: cut the overflow (and the text's closing token) off the
        # text part; the feature encoding keeps its own closing token.
        input_ids = text_ids[:len(text_ids) - exceed_length - 1] + feature_ids[1:]
    else:
        input_ids = text_ids[:-1] + feature_ids[1:-1] + [tokenizer.sep_token_id]

    used = len(input_ids)
    padding = max_len - used
    inputs = {
        'input_ids': input_ids + [tokenizer.pad_token_id] * padding,
        'attention_mask': [1] * used + [0] * padding,
        # Previously missing in the truncation branch, which made callers that
        # read inputs['token_type_ids'] fail with KeyError on long notes.
        'token_type_ids': [0] * max_len,
    }
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs, len(inputs['input_ids'])

In [4]:
# Lookup tables that get joined onto the pseudo-labeling rows.
features = pd.read_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/features.csv')
# Manual override of the feature text stored at row label 27.
features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
patient_notes = pd.read_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/patient_notes.csv')

# Attach the feature text and the note text to every row; values that are
# missing after the left joins become empty strings.
pseudo_labeling = (
    pd.read_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/pseudo_labeling.csv')
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
    .fillna('')
)
pseudo_labeling.head()

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00000_000,0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...,"17-year-old male, has come to the student heal..."
1,00000_001,0,0,1,Family-history-of-thyroid-disorder,"17-year-old male, has come to the student heal..."
2,00000_002,0,0,2,Chest-pressure,"17-year-old male, has come to the student heal..."
3,00000_003,0,0,3,Intermittent-symptoms,"17-year-old male, has come to the student heal..."
4,00000_004,0,0,4,Lightheaded,"17-year-old male, has come to the student heal..."


In [5]:
# str.capitalize upper-cases the first character and lower-cases the rest of
# each note.
pseudo_labeling["pn_history"] = pseudo_labeling["pn_history"].map(str.capitalize)

In [6]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable and
    switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)


In [7]:
def change_location_to_offset(text,location_list):
    """Turn character spans into a per-character 0/1 mask over ``text``.

    Args:
        text: The string the spans refer to; the mask has ``len(text)`` entries.
        location_list: Iterable of span-like items whose first two elements
            are the start and (exclusive) end character positions.

    Returns:
        A float ``numpy`` array of length ``len(text)`` holding 1 at every
        position covered by at least one span and 0 elsewhere.

    Items that cannot be read as a pair of integers are skipped, so one
    malformed location does not abort the whole conversion.
    """
    # The mask is character-level, so it is sized by the whole text.
    results = np.zeros(len(text))
    for location in location_list:
        try:
            start = int(location[0])
            end = int(location[1])
        except Exception:
            # Best-effort: ignore malformed entries.  `Exception` (rather than
            # a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
            continue
        results[start:end] = 1
    return results

def create_label(text):
    """Return the tokenizer's offset mapping for ``text``.

    ``text`` is encoded on its own with the module-level ``tokenizer``, using
    special tokens, ``max_len`` as the maximum length and padding to that
    length; only the ``'offset_mapping'`` entry of the encoding is returned.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        return_offsets_mapping=True,
    )['offset_mapping']


In [8]:
from tqdm import tqdm

# Parallel lists with one entry per row of `pseudo_labeling`.
test_ids, test_text = [], []
test_input_ids, test_token_type_ids, test_attention_mask = [], [], []
test_offset, test_label = [], []  # test_label is declared but never filled in this cell

for index, data in tqdm(pseudo_labeling.iterrows(), total=len(pseudo_labeling)):
    # Historical note left by the author: a bug was once seen here where every
    # read returned the same group of data.
    ids, text, feature_text = data['id'], data['pn_history'], data['feature_text']
    inputs, length = prepare_input(ids, text, feature_text)

    test_ids.append(ids)
    test_text.append(text)
    # Store plain Python lists; TestDataset converts them back to tensors.
    for bucket, key in ((test_input_ids, 'input_ids'),
                        (test_token_type_ids, 'token_type_ids'),
                        (test_attention_mask, 'attention_mask')):
        bucket.append(inputs[key].tolist())
    # Offsets come from encoding the note text on its own.
    test_offset.append(create_label(text))

  0%|                                                  | 0/5715 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████| 5715/5715 [00:03<00:00, 1512.64it/s]


In [9]:
from torch.utils.data import DataLoader,Dataset
class TestDataset(Dataset):
    """Dataset yielding ``(text, input_ids, offset, token_type_ids, attention_mask)``.

    ``text`` is kept as given; the other four columns are converted to tensors
    once at construction time (``input_ids``, ``token_type_ids`` and
    ``attention_mask`` as ``torch.long``).
    """

    def __init__(self,text,input_ids,offset,token_type_ids,attention_mask):
        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        # The raw list is kept because __len__ reports its length.
        self.input_ids = input_ids
        self.tensors = [
            text,
            as_long(input_ids),
            torch.tensor(offset),
            as_long(token_type_ids),
            as_long(attention_mask),
        ]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self,index):
        text, input_ids, offset, token_type_ids, attention_mask = self.tensors
        return (
            text[index],
            input_ids[index],
            offset[index],
            token_type_ids[index],
            attention_mask[index],
        )

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class ClassificationModel(nn.Module):
    """Backbone encoder followed by a one-unit linear head applied per token.

    Args:
        model: Backbone module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments, and
            element ``0`` of its output is fed to the head.
        hidden_size: Width of the backbone's per-token output. When omitted it
            is read from ``model.config.hidden_size`` if available, otherwise
            the previous fixed value of 1024 is used.
    """
    def __init__(self,model,hidden_size=None):
        super(ClassificationModel,self).__init__()
        self.model = model
        if hidden_size is None:
            # Infer the head's input width instead of hard-coding 1024, so
            # backbones of other sizes work without editing this class.
            hidden_size = getattr(getattr(model,'config',None),'hidden_size',1024)
        self.fc1 = nn.Linear(hidden_size,1)

    def forward(self,input_ids,token_type_ids,attention_mask):
        """Return the head's output for every token (last dimension of size 1)."""
        outputs = self.model(input_ids=input_ids,\
                           token_type_ids=token_type_ids,\
                           attention_mask=attention_mask)
        # Element 0 of the backbone output is what the head consumes.
        outputs = outputs[0]
        outputs = self.fc1(outputs)
        return outputs

In [11]:
def get_char_probs(total_text,offsets,predictions):
    """Spread per-token scores over the characters each token covers.

    Args:
        total_text: Sequence of strings; one score array is produced per string.
        offsets: For each string, a sequence of ``(start, end)`` character
            spans, one per token.
        predictions: For each string, a sequence of per-token predictions;
            element ``0`` of each prediction is read with ``.item()``.

    Returns:
        A list of float ``numpy`` arrays, the i-th having ``len(total_text[i])``
        entries. Characters not covered by any span stay 0; where spans
        overlap, the later token's score wins.
    """
    # The arrays differ in length (one per text, character-level), so they are
    # kept in a list rather than stacked into a single matrix.
    char_scores = []
    for note in total_text:
        char_scores.append(np.zeros(len(note)))
    # Global side effect kept as-is: tensors are printed without truncation
    # after this call.
    torch.set_printoptions(threshold=np.inf)
    for row, (token_spans, token_preds) in enumerate(zip(offsets, predictions)):
        for span, pred in zip(token_spans, token_preds):
            begin, finish = span[0], span[1]
            # Zero-width spans select an empty slice and therefore write nothing.
            char_scores[row][begin:finish] = pred[0].item()
    return char_scores

def get_results(test_text,char_probs, th=0.5):
    """Convert per-character scores into ``"start end"`` span strings.

    Args:
        test_text: Sequence of strings, aligned with ``char_probs``.
        char_probs: One ``numpy`` array of per-character scores per string.
        th: Characters scoring ``>= th`` are treated as selected.

    Returns:
        A list with one entry per element of ``char_probs``; each entry is a
        list of ``"start end"`` strings, one per run of consecutive selected
        characters.
    """
    results = []
    for row, char_prob in enumerate(char_probs):
        char_text = test_text[row]
        # Selected positions are shifted by one before grouping -- presumably
        # to skip a leading space included in each token's span (TODO confirm).
        shifted = np.where(char_prob >= th)[0] + 1

        # Consecutive integers share the same (value - rank) difference, so
        # grouping on it yields one group per unbroken run.
        spans = []
        for _, members in itertools.groupby(enumerate(shifted),
                                            key=lambda pair: pair[1] - pair[0]):
            run = [position for _rank, position in members]
            spans.append([run[0], run[-1]])

        # Undo the shift on the start when the character just before it is not
        # a space.
        for span in spans:
            previous = span[0] - 1
            if previous >= 0 and char_text[previous] != ' ':
                span[0] = previous

        results.append([str(span[0]) + ' ' + str(span[1]) for span in spans])
    return results

def get_predictions(results):
    """Parse span strings into ``[start, end]`` integer pairs.

    Args:
        results: One entry per sample. An entry is either a single string of
            ``"start end"`` pairs joined by ``';'`` (``""`` meaning no spans),
            or a list of ``"start end"`` strings -- the form ``get_results``
            in this notebook returns.

    Returns:
        A list with one entry per sample; each entry is a list of
        ``[start, end]`` integer pairs (empty when the sample has no spans).
    """
    predictions = []
    for result in results:
        # Accept both the ';'-joined string and the list form; previously the
        # list form failed because lists have no `.split`.
        pieces = result.split(';') if isinstance(result, str) else list(result)
        prediction = []
        for piece in pieces:
            loc = piece.split()
            if not loc:
                # Empty piece (e.g. "" or a trailing ';'): nothing to parse.
                continue
            start, end = int(loc[0]), int(loc[1])
            prediction.append([start, end])
        predictions.append(prediction)
    return predictions


In [12]:
# DebertaModel is only referenced by the commented-out alternative below.
from transformers import DebertaV2Model,DebertaModel
# itertools is used by get_results(), which is defined in an earlier cell.
import itertools
# Load the pretrained backbone from the local checkpoint directory.
deberta = DebertaV2Model.from_pretrained("/home/xiaoguzai/模型/deberta-v3-large")
#deberta = DebertaModel.from_pretrained("../input/deberta/base")
# Wrap the backbone with the per-token linear head.
# NOTE: the inference cell rebinds `model` to objects returned by torch.load,
# so this instance is not the one used for prediction there.
model = ClassificationModel(deberta)

Some weights of the model checkpoint at /home/xiaoguzai/模型/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
test_dataset = TestDataset(test_text,\
                           test_input_ids,\
                           test_offset,\
                           test_token_type_ids,\
                           test_attention_mask)
# shuffle=False keeps the predictions in the row order of `pseudo_labeling`,
# which matters because they are assigned back as a column below.
test_loader = DataLoader(test_dataset,batch_size=4,shuffle=False,\
                         num_workers=2,pin_memory=True)

final_result = []  # not used in this cell; kept so the name stays defined

# Checkpoint sets used in earlier runs, for reference (k = fold index 0..4):
#   /home/xiaoguzai/程序/NBME-Score Clinical Patient Notes/deberta-v3模型文件/deberta_capatalize_noid_best_point=<score>_fold=<k>.pth
#   /media/xiaoguzai/WD_BLACK/deberta-large-submit/archive(5)/deberta_Groupsplit_best_point_fold<k>.pth
#   ../input/nbmedebertamodelpart2/deberta_nocapitalize_groupkfold_best_point0.8784704033525406_fold3.pth
model_list = ['/media/xiaoguzai/WD_BLACK/deberta-v3-large模型文件/deberta_capatalize_noid_best_point0.887656_fold0.pth',\
              '/media/xiaoguzai/WD_BLACK/deberta-v3-large模型文件/deberta_capatalize_noid_best_point0.883649_fold1.pth',\
              '/media/xiaoguzai/WD_BLACK/deberta-v3-large模型文件/deberta_capatalize_noid_best_point0.88135_fold2.pth',\
              '/media/xiaoguzai/WD_BLACK/deberta-v3-large模型文件/deberta_capatalize_noid_best_point0.881055_fold3.pth',\
              '/media/xiaoguzai/WD_BLACK/deberta-v3-large模型文件/deberta_capatalize_noid_best_point0.88443_fold4.pth']
total_split = len(model_list)
for current_split in range(total_split):
    # Each file is loaded as a whole model object (it is called and moved to a
    # device below), i.e. a pickle.
    # SECURITY: torch.load unpickles arbitrary code -- only load trusted files.
    # map_location lets a checkpoint saved on another device load on this one.
    model = torch.load(model_list[current_split], map_location=device)
    model.eval()
    model.to(device)
    current_result = []
    # Inference only: no gradients are needed for the whole pass.
    with torch.no_grad():
        for batch_text,batch_input_ids,batch_offset,batch_token_type_ids,batch_attention_mask in tqdm(test_loader):
            batch_input_ids = batch_input_ids.to(device)
            batch_token_type_ids = batch_token_type_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            logit = model(input_ids=batch_input_ids,token_type_ids=batch_token_type_ids,\
                          attention_mask=batch_attention_mask)
            # Squash the per-token outputs into (0, 1) before thresholding.
            logit = torch.sigmoid(logit)
            preds = get_char_probs(batch_text,batch_offset,\
                                   logit.cpu())
            current_result.extend(preds)
    # One span list per row; each fold overwrites the column and writes its own file.
    results = get_results(test_text,current_result)
    pseudo_labeling['location'] = results
    pseudo_labeling[['id','case_num','pn_num','feature_num','location']].to_csv('/home/xiaoguzai/数据/NBME-Score Clinical Patient Notes/pseudo_labeling_deberta-v3_split='+str(current_split)+'_train.csv',index=False)

  0%|                                                  | 0/1429 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|                                          | 1/1429 [00:00<05:29,  4.33it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:18<00:00, 10.31it/s]
  0%|                                                  | 0/1429 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|                                          | 1/1429 [00:00<04:38,  5.12it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:19<00:00, 10.22it/s]
  0%|                                          | 1/1429 [00:00<04:36,  5.16it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:14<00:00, 11.26it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:14<00:00, 10.62it/s]
  0%|                                          | 1/1429 [00:00<04:27,  5.33it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:19<00:00, 10.65it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:19<00:00, 10.21it/s]
  0%|                                                  | 0/1429 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|                                          | 1/1429 [00:00<04:38,  5.14it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████| 1429/1429 [02:19<00:00, 10.22it/s]
