# BERT multilabel sequence classification 

Model for sequence classification 

Input : an entity extracted from clinical text
Output : one or more MeSH labels

Training set : mesh_term_dataset

First step : sequence classification with CamembertForSequenceClassification
Second step : Raw BERT + CNN

In [71]:
import numpy as np 
import pandas as pd 
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "2" # or another GPU index, depending on what nvidia-smi reports as available (set before torch is imported below)

import re

from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader

from torch.nn import BCEWithLogitsLoss, Sigmoid


from sklearn.model_selection import train_test_split


# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device',device)

# Local path of the pretrained CamemBERT checkpoint; both the tokenizer and the
# classification model are loaded from it in later cells.
bert_name = "/export/home/cse200093/camembert/camembert-base/"


device(type='cpu')

In [72]:
# Load the training file (labelled terms used to fit the classifier).
df_train = pd.read_csv("/export/home/cse200093/deft_2021/mesh_term_dataset.csv")

# Load the test file, i.e. the sosy and path entities predicted in the Pyner output.
df_test = pd.read_csv("/export/home/cse200093/deft_2021/pred_term_dataset.csv")

# Quick sanity check: training-set dimensions and the first rows of both frames.
print(df_train.shape)
df_train.head(), df_test.head()

(9383, 3)


Unnamed: 0,term,label,source
0,épileptique,nerveux,DEFT-train
1,légère somnolence,etatsosy,DEFT-train
2,10 fois la dose,chimiques,DEFT-train
3,vomissements,etatsosy,DEFT-train
4,retard de croissance intra-utérin,etatsosy,DEFT-train


In [73]:
# Merge the "homme" and "femme" labels into a single "urogen" label.
# The replacement is restricted to the `label` column: a DataFrame-wide replace
# would also rewrite any `term` (or `source`) cell whose value is exactly
# "homme" or "femme".
df_train['label'] = df_train['label'].replace({'femme': 'urogen', 'homme': 'urogen'})

# Names of the target classes ("nerveux", "etatsosy", "chimiques", ...). Each of
# them becomes a 0/1 column of the DataFrame below.
# NOTE(review): the one-hot columns created by get_dummies are NOT in this order
# (e.g. `label_urogen` ends up near the end, after `label_tumeur`), so this list
# must not be used to index the one-hot columns positionally.
col_label = ['ORL','blessures','cardiovasculaires','chimiques','digestif', 'endocriniennes', 
             'etatsosy','urogen','genetique','hemopathies','immunitaire','infections',
             'nerveux', 'nutritionnelles', 'oeil','osteomusculaires','parasitaires','peau',
             'respiratoire','stomatognathique','tumeur','virales']


# Transform the labelled data into one-hot columns named `label_<class>`.
df_train = pd.get_dummies(df_train,prefix = None, columns = ['label'])
display(df_train.columns)
df_train.head(5)


Index(['term', 'source', 'label_ORL', 'label_blessures',
       'label_cardiovasculaires', 'label_chimiques', 'label_digestif',
       'label_endocriniennes', 'label_etatsosy', 'label_genetique',
       'label_hemopathies', 'label_immunitaire', 'label_infections',
       'label_nerveux', 'label_nutritionnelles', 'label_oeil',
       'label_osteomusculaires', 'label_parasitaires', 'label_peau',
       'label_respiratoire', 'label_stomatognathique', 'label_tumeur',
       'label_urogen', 'label_virales'],
      dtype='object')

Unnamed: 0,term,source,label_ORL,label_blessures,label_cardiovasculaires,label_chimiques,label_digestif,label_endocriniennes,label_etatsosy,label_genetique,...,label_nutritionnelles,label_oeil,label_osteomusculaires,label_parasitaires,label_peau,label_respiratoire,label_stomatognathique,label_tumeur,label_urogen,label_virales
0,épileptique,DEFT-train,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,légère somnolence,DEFT-train,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,10 fois la dose,DEFT-train,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,vomissements,DEFT-train,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,retard de croissance intra-utérin,DEFT-train,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# Pre-processing of the text : removing the non-alphanumérique data : 
import unidecode

def strip(text):
    """Transliterate *text* to ASCII and remove unwanted characters.

    The text is first passed through ``unidecode.unidecode``; every character
    that is not an ASCII letter, a digit, whitespace, a comma or an apostrophe
    is then removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The upper-case range must be `A-Z`. The previous `A-z` range also spans
    # the ASCII characters located between 'Z' and 'a' (brackets, backslash,
    # caret, underscore, backtick), which were therefore kept in the output.
    pattern = r"[^a-zA-Z0-9\s,']"
    text = unidecode.unidecode(text)
    return re.sub(pattern, '', text)

# Clean the `term` column of both frames with the same `strip` function, so the
# training and prediction inputs go through identical normalisation.
df_train['term'] = df_train['term'].apply(strip)
df_test['term'] = df_test['term'].apply(strip)

# Preview the cleaned frames.
df_train.head(), df_test.head()

Unnamed: 0,term,source,label_ORL,label_blessures,label_cardiovasculaires,label_chimiques,label_digestif,label_endocriniennes,label_etatsosy,label_genetique,...,label_nutritionnelles,label_oeil,label_osteomusculaires,label_parasitaires,label_peau,label_respiratoire,label_stomatognathique,label_tumeur,label_urogen,label_virales
0,epileptique,DEFT-train,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,legere somnolence,DEFT-train,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,10 fois la dose,DEFT-train,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,vomissements,DEFT-train,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,retard de croissance intrauterin,DEFT-train,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# One-hot label columns, in the order used for the target matrix.
label_columns = [
    'label_ORL',
    'label_blessures',
    'label_cardiovasculaires',
    'label_chimiques',
    'label_digestif',
    'label_endocriniennes',
    'label_etatsosy',
    'label_genetique',
    'label_hemopathies',
    'label_immunitaire',
    'label_infections',
    'label_nerveux',
    'label_nutritionnelles',
    'label_oeil',
    'label_osteomusculaires',
    'label_parasitaires',
    'label_peau',
    'label_respiratoire',
    'label_stomatognathique',
    'label_tumeur',
    'label_urogen',
    'label_virales',
]

# Target matrix: one row per term, one column per entry of `label_columns`.
labels = df_train[label_columns].to_numpy()

In [76]:
# Pre-processing: split the labelled terms, then tokenize every subset.

# Maximum sequence length handed to the tokenizer; longer inputs are truncated.
# Reminder: 13 of the 9383 training samples are larger than 128.
Max_len = 128

train_sentences = list(df_train['term'])
test_sentences = list(df_test['term'])

# Hold out 20% of the labelled terms for validation. The seed is fixed so that
# the split, and therefore the validation scores, are the same from one run to
# the next.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_sentences, labels, test_size=.2, random_state=42)


# Define the tokenizer, loaded from the same checkpoint path as the model.
tokenizer = CamembertTokenizer.from_pretrained(bert_name, do_lower_case = True)

# Tokenize the train, validation and test texts. Each call pads to the longest
# sequence of its own subset, so the three tensors may have different widths.
train_tokenizer_texts = tokenizer(train_texts, return_tensors = 'pt', padding=True, truncation=True, max_length = Max_len)
val_tokenizer_texts = tokenizer(val_texts, return_tensors = 'pt', padding=True, truncation=True, max_length = Max_len)
test_tokenizer_texts = tokenizer(test_sentences, return_tensors = 'pt',padding=True, truncation=True, max_length = Max_len)


print('train_size ', train_tokenizer_texts['input_ids'].size())
print('val_size ', val_tokenizer_texts['input_ids'].size())
print('test_size ', test_tokenizer_texts['input_ids'].size())

# Convenience handles on the attention masks of each subset.
attention_masks = train_tokenizer_texts['attention_mask']
val_attention_masks = val_tokenizer_texts['attention_mask']
test_attention_masks = test_tokenizer_texts['attention_mask']


#attention_masks[0] , val_attention_masks[0], test_attention_masks[0]
train_labels.shape, val_labels.shape

HBox(children=(FloatProgress(value=0.0, max=9383.0), HTML(value='')))




array([  403,   228, 10286,   374, 17937,    62,  3737,  1603,   403,
         229,  4495,   374,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0]

In [77]:
#Dataset wrapping tensors.

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label rows.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample (the cells of this file pass a tokenizer
            output containing ``input_ids`` and ``attention_mask``).
        labels: Indexable collection holding one label row per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return *value* as a tensor that does not share memory with its source."""
        # Calling torch.tensor() on an existing tensor emits a UserWarning that
        # recommends clone().detach(); the encodings are already tensors when
        # the tokenizer is called with return_tensors='pt'.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field of sample *idx* plus its ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples, i.e. the number of label rows."""
        return len(self.labels)

# Wrap the tokenized training / validation texts together with their label rows.
train_dataset = IMDbDataset(train_tokenizer_texts, train_labels)
val_dataset = IMDbDataset(val_tokenizer_texts, val_labels)

# No dataset is built for the test split: no `test_labels` is defined in the visible cells.
#test_dataset = IMDbDataset(test_tokenizer_texts, test_labels)


HBox(children=(FloatProgress(value=0.0, max=9383.0), HTML(value='')))




array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [78]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Size of the classification head: one output per one-hot label column.
num_labels = 22

model = CamembertForSequenceClassification.from_pretrained(bert_name, num_labels = num_labels)

model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation does not need shuffling; a fixed order keeps the predictions
# collected during validation aligned with val_texts / val_labels.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


# Define the optimizer parameter groups: weight decay is applied to every
# parameter except those whose name contains 'bias' or 'LayerNorm.weight'.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optim = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=True)


# Loss for the multi-label setting: BCEWithLogitsLoss combines a sigmoid with a
# binary cross-entropy, so each of the num_labels outputs is scored independently.
# Created once instead of once per batch.
loss_func = BCEWithLogitsLoss()

# Probability above which a label is considered predicted during validation.
threshold = 0.2

for epoch in range(50):
    ### training step
    print("Training Mode for epoch n°", epoch)
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so that the module-level `labels` array is not overwritten.
        batch_labels = batch['labels'].to(device)
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        # No labels are passed to the model, so the loss is computed here from
        # the first output (the logits). Labels are cast to the logits' dtype.
        logits = outputs[0]
        loss = loss_func(logits.view(-1, num_labels),
                         batch_labels.type_as(logits).view(-1, num_labels))
        loss.backward()
        optim.step()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

    # Print the current training loss. Each loss.item() is already a mean over
    # its batch, so the epoch average divides by the number of steps; dividing
    # by the number of examples under-reports the loss by about the batch size.
    print("Train Loss: {}".format(tr_loss / max(nb_tr_steps, 1)))


    # PREDICT :
    print("Evaluation Mode for epoch n°", epoch)
    model.eval()
    # Variables to gather the full output of this epoch's validation pass
    logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        with torch.no_grad():
            outs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
            logit_pred = outs[0]
            # Independent per-label probabilities
            pred_label = torch.sigmoid(logit_pred)
            logit_pred = logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            batch_labels = batch_labels.to('cpu').numpy()
            tokenized_texts.append(input_ids)
            logit_preds.append(logit_pred)
            true_labels.append(batch_labels)
            pred_labels.append(pred_label)

    # Flatten the per-batch arrays into one row per validation sample
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Binarise predictions with the threshold, then compute the micro-averaged
    # F1 and the accuracy_score over the full label rows, both as percentages.
    pred_bools = [pl>threshold for pl in pred_labels]
    true_bools = [tl==1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

    
# Save the fine-tuned model after the last epoch.
# NOTE(review): save_pretrained presumably treats this path as a directory
# despite the ".pt" suffix — confirm before loading it elsewhere.
model.save_pretrained("classifier_Base_50.pt")