# Fine Tuning

Use this section if you want to retrain the model on your own data.

In [None]:
#Imports

import pandas as pd
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss, Sigmoid
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, \
    accuracy_score, auc, average_precision_score
from transformers import *
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import os
import re
import transformers
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset

In [None]:
#Setting seeds

# One shared seed value for Python hashing, NumPy and PyTorch.
SEED = 2021

os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
#Important parameters that will be used

n_splits = 5 #number of folds used in cross validation
epochs = 50 #number of training epochs run for each fold
lr = 2e-5 #learning rate passed to the AdamW optimizer
max_length = 120 #max_length passed to the tokenizer (texts are padded to this many tokens)

In [None]:
#General configs for GPU usage and reference to directories

# Restrict this process to GPU index 1, numbering devices in PCI bus order.
# NOTE(review): on a machine with a single GPU this hides it and training falls back to CPU;
# adjust the index to your hardware.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Compare device.type (a str); comparing the torch.device object itself to 'cuda' is not reliable.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))


def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) if it is not there yet.

    This is a best-effort helper: it always returns ``None``. ``exist_ok=True``
    covers the case where the directory already exists or is created by another
    process between the check and the creation. Other filesystem errors
    (``OSError``) are still swallowed, as before, but exceptions unrelated to
    the filesystem (e.g. ``KeyboardInterrupt``) are no longer caught.

    Args:
        directory_path: path of the directory to create.

    Returns:
        None
    """
    try:
        os.makedirs(directory_path, exist_ok=True)
    except OSError:
        return None
    return None


result_dir = './' #insert here your directory for results; file names are appended directly to this string, so keep the trailing '/'
create_directory(result_dir)

In [None]:
data_path = './' #insert here the path of your data
data = pd.read_csv(data_path, sep=',') #load your data

#Here are defined the trained symptoms
sintomas = ['calafrio', 'congestão nasal', 'coriza', 'diarreia', 'dor de cabeça',
       'dor muscular', 'dor de garganta', 'febre', 'perda de olfato', 'perda de paladar', 'sonolência', 'tosse',
       'enjoo', 'cansaço'] 

sintomas_ingles = ['chill', 'nasal congestion', 'runny nose', 'diarrhea', 'headache',
       'muscle pain', 'sore throat', 'fever', 'loss of smell', 'loss of taste', 'drowsiness', 'cough',
       'sickness', 'tiredness']


# Label columns only. .copy() makes data2 an independent frame, so adding the 'one_hot'
# column below does not write into a slice of `data` (SettingWithCopyWarning).
data2 = data[sintomas].copy()

# Binarise the labels in one vectorised step: any value >= 2 becomes 1,
# every other value is kept unchanged.
label_values = data2[sintomas].to_numpy()
labels = np.where(label_values >= 2, 1, label_values)

# One list of per-symptom labels per row, kept for inspection alongside the raw columns.
data2['one_hot'] = labels.tolist()


texts = list(data['text'].values)

Here we pre-process the data to remove links, "RT" markers, emojis, special characters, hashtags and Twitter usernames.

In [None]:
# Cleaning patterns. Each one removes a whole whitespace-delimited token (plus one trailing
# space when present) from a tweet.
http = re.compile(r'http.*? |http.* ?')      # links starting with http/https
username = re.compile(r'@.*? |@.* ?')        # @mentions
# Standalone "RT" marker in any casing; \b keeps words that merely end in "rt" intact.
rt = re.compile(r'\b[rR][tT] ')
www = re.compile(r'www.*? |www.* ?')         # links starting with www
# Bare domains containing ".com" / ".br". \S* limits the match to the token that holds the
# domain; a leading ".*?" would delete everything from the start of the text up to it.
com = re.compile(r'\S*\.com\b\S* ?')
br = re.compile(r'\S*\.br\b\S* ?')
hashtag = re.compile(r'#.*? |#.* ?')         # hashtags, including their text
emojis = re.compile("["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF"  
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f" 
        u"\u3030"
                      "]+", re.UNICODE)


def remover(textos, regex):
    """Delete every match of ``regex`` from a text or from each text in a collection.

    Args:
        textos: a single string, or an iterable of strings.
        regex: compiled pattern whose matches are replaced by the empty string.

    Returns:
        The cleaned string when ``textos`` is a string, otherwise a list with
        one cleaned string per input item.
    """
    # isinstance (not an exact type comparison) so that str subclasses such as numpy.str_
    # are treated as one text instead of being iterated character by character.
    if isinstance(textos, str):
        return regex.sub('', textos)
    return [regex.sub('', texto) for texto in textos]


# Apply the cleaners one after another; the order is the same as the pattern tuple below.
for cleaning_pattern in (username, http, rt, www, com, br, hashtag, emojis):
    texts = remover(texts, cleaning_pattern)

In [None]:
#Create the tokenizer, split and tokenize your data

tokenizer = transformers.BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=True)  # tokenizer

# Every text is encoded and padded to max_length tokens.
encodings = tokenizer.batch_encode_plus(texts, max_length=max_length,
                                        pad_to_max_length=True)  

print('tokenizer outputs: ', encodings.keys())


# A single call splits all four aligned sequences with the same shuffled indices
# (80% train / 20% test). The result alternates train part, test part for each input.
split_parts = train_test_split(
    encodings['input_ids'],        # tokenized and encoded sentences
    labels,
    encodings['token_type_ids'],   # token type ids
    encodings['attention_mask'],   # attention masks
    train_size=0.8, random_state=10)

(input_ids, test_input_ids,
 labels, test_labels,
 token_type_ids, test_token_type_ids,
 attention_masks, test_attention_masks) = [np.array(part) for part in split_parts]

In [None]:
# Store our loss and accuracy for plotting
def _per_fold_lists():
    """Return a new list holding one independent empty list per cross-validation fold."""
    return [[] for _ in range(n_splits)]


# Loss per training step / per epoch
train_loss_set = _per_fold_lists()
train_loss_set_epoch = _per_fold_lists()

# Loss per validation step / per epoch
val_loss_set = _per_fold_lists()
val_loss_set_epoch = _per_fold_lists()

# Micro-F1 and exact-match accuracy per epoch
train_f1_accuracy_set = _per_fold_lists()
train_flat_accuracy_set = _per_fold_lists()

val_f1_accuracy_set = _per_fold_lists()
val_flat_accuracy_set = _per_fold_lists()

# Validation predictions and targets per epoch (probabilities, then thresholded booleans)
pred_labels_split = _per_fold_lists()
true_labels_split = _per_fold_lists()

np_pred_bools_split = _per_fold_lists()
np_true_bools_split = _per_fold_lists()

# Trained model of each fold
models = _per_fold_lists()

In [None]:
def save_ckp(state):
    """Write ``state`` to disk with ``torch.save``.

    The file name is assembled from the globals ``result_dir``, ``modelname``
    and the current fold index ``j``, so each fold has its own checkpoint file
    that is overwritten on every call.

    Args:
        state: object to serialise (the training loop passes a dict).
    """
    checkpoint_path = '{}{}_{}_best_weights.pt'.format(result_dir, modelname, j)
    torch.save(state, checkpoint_path)

#Creates the cross validation framework
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
kfold.get_n_splits(input_ids)

# Width of one label row = number of symptoms; kept under both names used by later cells.
num_labels = n_classes = len(labels[0])

In [None]:
#Training

# Index of the current fold. save_ckp() reads this global to name the checkpoint file.
j = 0

# Read the starting checkpoint ONCE, before any fold runs. Fold 0's best weights are written
# by save_ckp() to this very file name, so re-reading it inside the loop would start the
# later folds from weights already fine-tuned on data that is in their validation split.
# NOTE: the file on disk is still replaced by fold 0's best checkpoint; keep a backup of the
# original weights if you need them again.
initial_checkpoint = torch.load(result_dir + modelname + f'_0_best_weights.pt', map_location='cpu')

# Multi-label objective: one independent sigmoid + binary cross-entropy term per symptom.
loss_func = BCEWithLogitsLoss()

for train_index, validation_index in kfold.split(input_ids, labels):
    print()
    print('--------------------------------------------------------')
    print()
    print(f'Current Fold: {j + 1}/{n_splits}')

    # Best (lowest) mean validation loss seen so far in this fold.
    valid_loss_min = float('inf')

    train_inputs, validation_inputs = input_ids[train_index], input_ids[validation_index]
    train_labels, validation_labels = labels[train_index], labels[validation_index]
    train_token_types, validation_token_types = token_type_ids[train_index], token_type_ids[validation_index]
    train_masks, validation_masks = attention_masks[train_index], attention_masks[validation_index]

    # Convert all of our data into torch tensors, the required datatype for our model
    train_inputs = torch.tensor(train_inputs)
    train_labels = torch.tensor(train_labels)
    train_masks = torch.tensor(train_masks)
    train_token_types = torch.tensor(train_token_types)
    validation_inputs = torch.tensor(validation_inputs)
    validation_labels = torch.tensor(validation_labels)
    validation_masks = torch.tensor(validation_masks)
    validation_token_types = torch.tensor(validation_token_types)

    # Select a batch size for training.
    batch_size = 32

    train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    # Build BERTimbau and load our previously trained weights into it
    model = transformers.BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=num_labels)

    # .to(device) instead of .cuda() so the cell also runs on CPU-only machines.
    model.to(device)
    model.load_state_dict(initial_checkpoint['state_dict'])  # Load our trained weights

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # NOTE(review): the optimizer option is normally spelled 'weight_decay'; confirm that
    # 'weight_decay_rate' is actually honoured by the AdamW version in use. The optimizer state
    # loaded below also carries its own saved hyper-parameters.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=True)
    optimizer.load_state_dict(initial_checkpoint['optimizer'])  # Load our trained optimizer state

    for epoch in trange(epochs, desc="Epoch"):
        # Set our model to training mode
        model.train()

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        val_loss = 0
        nb_val_examples, nb_val_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels, b_token_types = batch
            optimizer.zero_grad()

            # Forward pass
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            # Labels are cast to the logits' dtype because BCEWithLogitsLoss needs float targets.
            loss = loss_func(logits.view(-1, num_labels).to(device), b_labels.type_as(logits).view(-1, num_labels).to(
                device))
            train_loss_set[j].append(loss.item())

            # Backward pass
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        train_loss_set_epoch[j].append(tr_loss / nb_tr_steps)

        # Accuracy of the train
        model.eval()

        logit_preds, true_labels, pred_labels = [], [], []
        for i, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels, b_token_types = batch
            with torch.no_grad():
                # Forward pass
                outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                b_logit_pred = outs[0]
                pred_label = torch.sigmoid(b_logit_pred)

                b_logit_pred = b_logit_pred.detach().cpu().numpy()
                pred_label = pred_label.to('cpu').numpy()
                b_labels = b_labels.to('cpu').numpy()

            logit_preds.append(b_logit_pred)
            true_labels.append(b_labels)
            pred_labels.append(pred_label)

        # Flatten the per-batch lists into one entry per example
        pred_labels = [item for sublist in pred_labels for item in sublist]
        true_labels = [item for sublist in true_labels for item in sublist]

        # Calculate Accuracy
        threshold = 0.50
        pred_bools = [pl > threshold for pl in pred_labels]
        true_bools = [tl == 1 for tl in true_labels]

        np_true_bools = np.array(true_bools)
        np_pred_bools = np.array(pred_bools)

        train_f1_accuracy = f1_score(true_bools, pred_bools, average='micro') * 100
        train_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100

        train_f1_accuracy_set[j].append(train_f1_accuracy)
        train_flat_accuracy_set[j].append(train_flat_accuracy)

        # print('F1 Train Accuracy: ', train_f1_accuracy)
        print('Train Accuracy: ', train_flat_accuracy)

        # Validation

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

        for i, batch in enumerate(validation_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels, b_token_types = batch
            with torch.no_grad():
                # Forward pass
                outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                b_logit_pred = outs[0]
                # Cast the labels against this batch's own logits, not the last training batch's.
                loss = loss_func(b_logit_pred.view(-1, num_labels),
                                 b_labels.type_as(b_logit_pred).view(-1, num_labels))
                val_loss_set[j].append(loss.item())

                pred_label = torch.sigmoid(b_logit_pred)

                b_logit_pred = b_logit_pred.detach().cpu().numpy()
                pred_label = pred_label.to('cpu').numpy()
                b_labels = b_labels.to('cpu').numpy()

            tokenized_texts.append(b_input_ids)
            logit_preds.append(b_logit_pred)
            true_labels.append(b_labels)
            pred_labels.append(pred_label)

            val_loss += loss.item()
            nb_val_examples += b_input_ids.size(0)
            nb_val_steps += 1

        # Mean validation loss over the validation batches (not over the training batches).
        epoch_val_loss = val_loss / nb_val_steps
        print("Validation loss: {}".format(epoch_val_loss))

        val_loss_set_epoch[j].append(epoch_val_loss)

        # Keep only the best epoch of this fold on disk.
        if epoch_val_loss <= valid_loss_min:
            # create checkpoint variable and add important data
            checkpoint = {
                'epoch': epoch + 1,
                'valid_loss_min': epoch_val_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            save_ckp(checkpoint)
            valid_loss_min = epoch_val_loss

        pred_labels = [item for sublist in pred_labels for item in sublist]
        true_labels = [item for sublist in true_labels for item in sublist]

        pred_labels_split[j].append(pred_labels)
        true_labels_split[j].append(true_labels)

        # Calculate Accuracy
        threshold = 0.50
        pred_bools = [pl > threshold for pl in pred_labels]
        true_bools = [tl == 1 for tl in true_labels]

        np_true_bools = np.array(true_bools)
        np_pred_bools = np.array(pred_bools)

        np_true_bools_split[j].append(np_true_bools)
        np_pred_bools_split[j].append(np_pred_bools)

        val_f1_accuracy = f1_score(true_bools, pred_bools, average='micro') * 100
        val_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100

        val_f1_accuracy_set[j].append(val_f1_accuracy)
        val_flat_accuracy_set[j].append(val_flat_accuracy)

        print('Validation Accuracy: ', val_flat_accuracy)

    # Keep the model as it is after the last epoch (the best epoch is the one saved on disk).
    models[j] = model
    j += 1


In [None]:
#Loss evolution

# Epoch axis used by the plots: 1 .. epochs.
x = np.linspace(1, epochs, epochs)

# One row per fold, one column per epoch.
train_loss_by_fold = np.vstack(train_loss_set_epoch)
val_loss_by_fold = np.vstack(val_loss_set_epoch)

# The median across folds is less sensitive to outliers than the mean.
val_median = np.percentile(val_loss_by_fold, 50.0, axis=0)
# 0-based index of the epoch with the lowest median validation loss.
best_epoch = val_median.argmin()
print(f'Best Epoch: {best_epoch}')

# Fold with the lowest validation loss at that epoch.
values_folds = [val_loss_set_epoch[i][best_epoch] for i in range(n_splits)]
best_fold = np.array(values_folds).argmin()
print(f'Best Fold: {best_fold}')

# Median and 15.87 / 84.13 percentile band across folds (the +/- 1 sigma range of a normal distribution).
train_median = np.percentile(train_loss_by_fold, 50.0, axis=0)
train_lowlim = np.percentile(train_loss_by_fold, 15.87, axis=0)
train_highlim = np.percentile(train_loss_by_fold, 84.13, axis=0)
train_interval = (train_highlim - train_lowlim)
train_frac_err = train_interval / train_median
train_half_interval = train_interval / 2.0

plt.title('Loss using Median and Percentiles', fontsize=15)
plt.plot(x, train_median, label=r'Train Loss', color='C0')
plt.fill_between(x, train_lowlim, train_highlim, color='C0', alpha=0.4)

val_lowlim = np.percentile(val_loss_by_fold, 15.87, axis=0)
val_highlim = np.percentile(val_loss_by_fold, 84.13, axis=0)
val_interval = (val_highlim - val_lowlim)
val_frac_err = val_interval / val_median
val_half_interval = val_interval / 2.0

plt.plot(x, val_median, label=r'Val Loss', color='C1')
plt.fill_between(x, val_lowlim, val_highlim, color='C1', alpha=0.4)

plt.xlabel(r'Epochs', fontsize=12)
plt.ylabel(r'Loss', fontsize=12)
plt.xticks(np.arange(0, epochs + 1, 10))
plt.xlim(1, epochs)
# plt.ylim(0)
# The legend reports the values at the best epoch instead of the plain curve names.
plt.legend([f'Train: {train_median[best_epoch].round(5)} +/- {train_half_interval[best_epoch].round(5)}',
            f'Validation: {val_median[best_epoch].round(5)} +/- {val_half_interval[best_epoch].round(5)}'],
           fontsize=12)

plt.annotate(f'Best epoch: {best_epoch}',
             xy=(1, 0.03),
             fontsize=8)

plt.show()


# Each message reports the series it names (train loss from the train set, validation from the validation set).
print(f"Best fold's Train loss: {np.float64(train_loss_set_epoch[best_fold][best_epoch]).round(5)}")
print(f"Best fold's Validation loss: {np.float64(val_loss_set_epoch[best_fold][best_epoch]).round(5)}")

In [None]:
# Accuracy evolutions

# One row per fold, one column per epoch.
train_acc_by_fold = np.vstack(train_flat_accuracy_set)
val_acc_by_fold = np.vstack(val_flat_accuracy_set)

# Training accuracy: median across folds and 15.87 / 84.13 percentile band.
train_median = np.percentile(train_acc_by_fold, 50.0, axis=0)
train_lowlim = np.percentile(train_acc_by_fold, 15.87, axis=0)
train_highlim = np.percentile(train_acc_by_fold, 84.13, axis=0)
train_interval = train_highlim - train_lowlim
train_frac_err = train_interval / train_median
train_half_interval = train_interval / 2.0

plt.title('Validation and Train Accuracys', fontsize=15)
plt.plot(x, train_median, label=r'Train Flat Acc', color='C0')
plt.fill_between(x, train_lowlim, train_highlim, color='C0', alpha=0.4)

print(f'Train Acc: {train_median[best_epoch].round(10)} +/- {train_half_interval[best_epoch].round(10)}')

# Validation accuracy: same statistics.
val_median = np.percentile(val_acc_by_fold, 50.0, axis=0)
val_lowlim = np.percentile(val_acc_by_fold, 15.87, axis=0)
val_highlim = np.percentile(val_acc_by_fold, 84.13, axis=0)
val_interval = val_highlim - val_lowlim
val_frac_err = val_interval / val_median
val_half_interval = val_interval / 2.0

plt.plot(x, val_median, label=r'Val Flat Acc', color='C1')
plt.fill_between(x, val_lowlim, val_highlim, color='C1', alpha=0.4)

plt.xlabel(r'Epochs')
plt.ylabel(r'Accuracy')
plt.xlim(1, epochs)

# The legend shows the values at the best epoch found in the loss cell.
train_legend = f'Train: {train_median[best_epoch].round(2)} +/- {train_half_interval[best_epoch].round(2)}'
val_legend = f'Validation: {val_median[best_epoch].round(2)} +/- {val_half_interval[best_epoch].round(2)}'
plt.legend([train_legend, val_legend], fontsize=12)



print(f'Validation Acc: {val_median[best_epoch].round(5)} +/- {val_half_interval[best_epoch].round(5)}')
plt.show()

# Testing

If you fine-tuned the model using the previous section, don't run the next cell.

If you just want to test the model on your data, without fine-tuning, load your data and run the next cell.

In [None]:
# One slot per fold; each slot receives the model restored from that fold's best checkpoint.
models = [[] for i in range(n_splits)]

for j in range(n_splits):
    # Read the fold's checkpoint once. map_location lets a checkpoint saved on a GPU be
    # loaded on a CPU-only machine as well.
    checkpoint = torch.load(result_dir + modelname + f'_{j}_best_weights.pt', map_location=device)

    # The classification head must have one output per trained symptom.
    model = transformers.BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=len(sintomas))

    model.to(device)
    model.load_state_dict(checkpoint['state_dict'])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]

    # The optimizer is restored too, although the evaluation cells below only use the model.
    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=True)
    optimizer.load_state_dict(checkpoint['optimizer'])

    models[j] = model

In [None]:
#From now on, we will only use the test dataset

# Per fold: predicted probabilities, targets, and their thresholded boolean versions.
test_pred_labels_split = [[] for i in range(n_splits)]
test_true_labels_split = [[] for i in range(n_splits)]

test_np_true_bools_split = [[] for i in range(n_splits)]
test_np_pred_bools_split = [[] for i in range(n_splits)]

# as_tensor accepts both the numpy arrays from the split cell and tensors left by a previous
# run of this cell, so the cell can be re-run.
test_input_ids = torch.as_tensor(test_input_ids, dtype=torch.long)
test_attention_masks = torch.as_tensor(test_attention_masks, dtype=torch.float32)
test_labels = torch.as_tensor(test_labels, dtype=torch.float32)
test_token_type_ids = torch.as_tensor(test_token_type_ids, dtype=torch.float32)

# Defined here because `batch_size` only exists after the training cell has run, and this
# cell must also work on the "test only" path. It only sets how many examples go through
# the model per forward pass.
test_batch_size = 32

# The test set is the same for every fold's model, so build the loader once.
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels, test_token_type_ids)
test_dataloader = DataLoader(test_data, batch_size=test_batch_size)

for j, model in enumerate(models):
    model.eval()

    logit_preds, true_labels, pred_labels = [], [], []

    # Predict
    for i, batch in enumerate(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            # Forward pass
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()

        logit_preds.append(b_logit_pred)
        true_labels.append(b_labels)
        pred_labels.append(pred_label)

    # Flatten the per-batch lists into one entry per test example
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    test_pred_labels_split[j].append(pred_labels)
    test_true_labels_split[j].append(true_labels)

    # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl > threshold for pl in pred_labels]
    true_bools = [tl == 1 for tl in true_labels]

    np_true_bools = np.array(true_bools)
    np_pred_bools = np.array(pred_bools)

    test_np_true_bools_split[j].append(np_true_bools)
    test_np_pred_bools_split[j].append(np_pred_bools)

In [None]:
#Calculations for Precision-Recall Curve

# NOTE(review): `precision` and `recall` are not defined in any cell visible in this notebook;
# the indexing below presumably expects them as [fold][class][...] — confirm and add the cell
# that computes them before running this one.

# Regroup the per-fold values so that the first axis is the class.
PRECISIONs = []
RECALLs = []

for k in range(n_classes):
    PRECISIONs.append(np.array(precision)[:, k])
    RECALLs.append(np.array(recall)[:, k])

PRECISIONs = np.array(PRECISIONs)
RECALLs = np.array(RECALLs)

# Each fold's list holds a single array, so [:, 0] picks that array for every fold.
PREDs = np.array(test_np_pred_bools_split)[:, 0]
REALs = np.array(test_np_true_bools_split)[:, 0]

# average_precision[class][fold] is a one-element list with that fold's score for the class.
# The score is computed from the thresholded (boolean) predictions, not from the probabilities.
average_precision = [[[] for i in range(n_splits)] for i in range(n_classes)]
for k in range(n_splits):
    for i in range(n_classes):
        average_precision[i][k].append(
            average_precision_score(test_np_true_bools_split[k][0][:, i], test_np_pred_bools_split[k][0][:, i]))

In [None]:
plt.figure(figsize=(8, 8))
for k in range(n_classes):
    # Median curve across folds for class k.
    median_recall = np.percentile(RECALLs[k], 50, axis=0)
    median_precision = np.percentile(PRECISIONs[k], 50, axis=0)

    # Median average precision across folds and half of its 15.87-84.13 percentile spread.
    ap_median = np.percentile(average_precision[k], 50, axis=0)
    ap_high = np.percentile(average_precision[k], 84.13, axis=0)
    ap_low = np.percentile(average_precision[k], 15.87, axis=0)
    ap_half_interval = (ap_high - ap_low) / 2.

    plt.plot(median_recall, median_precision,
             label=f'{sintomas_ingles[k]}: {ap_median.round(2)}',
             color=f'C{k}')
    
    #Use the next 2 lines if you want to see the errors
    #plt.fill_between(np.percentile(RECALLs[k], 50, axis=0)[:, 0], np.percentile(PRECISIONs[k], 84.13, axis=0)[:, 0],
    #                 np.percentile(PRECISIONs[k], 15.87, axis=0)[:, 0], color=f'C{k}', alpha=0.4)

    print(r'Classe %s : %.3f +/- %.3f' % (sintomas_ingles[k], ap_median, ap_half_interval))
plt.xlabel(r'Recall')
plt.ylabel(r'Precision')
plt.legend(loc='best')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
#Calculations for ROC Curve

# NOTE(review): this cell takes len(threshold) and indexes TP / FN / FP / TN as
# [fold][class][threshold], but none of those is built that way in the cells visible in this
# notebook (the evaluation cell sets `threshold` to a single float) — confirm and add the cell
# that computes them before running this one.

# fpr[fold][class][threshold] and tpr[fold][class][threshold] are one-element lists.
fpr = [[[[] for i in range(len(threshold))] for i in range(n_classes)] for i in range(n_splits)]
tpr = [[[[] for i in range(len(threshold))] for i in range(n_classes)] for i in range(n_splits)]

for k in range(n_splits):
    for j in range(n_classes):
        for t in range(len(threshold)):
            # Only an empty denominator falls back to 1. Any other error (missing variable,
            # wrong shape, ...) is raised instead of silently producing a curve of ones.
            try:
                tpr[k][j][t].append(TP[k][j][t] / (TP[k][j][t] + FN[k][j][t]))
            except ZeroDivisionError:
                tpr[k][j][t].append(1)
            try:
                fpr[k][j][t].append(FP[k][j][t] / (FP[k][j][t] + TN[k][j][t]))
            except ZeroDivisionError:
                fpr[k][j][t].append(1)


# Regroup so that the first axis is the class.
FPRs = []
TPRs = []

for k in range(n_classes):
    FPRs.append(np.array(fpr)[:, k])
    TPRs.append(np.array(tpr)[:, k])

FPRs = np.array(FPRs)
TPRs = np.array(TPRs)

In [None]:
plt.figure(figsize=(8, 8))
for k in range(n_classes):
    # Median curve across folds for class k.
    median_fpr = np.percentile(FPRs[k], 50, axis=0)
    median_tpr = np.percentile(TPRs[k], 50, axis=0)

    plt.plot(median_fpr, median_tpr,
             label=f'{sintomas[k]}: {auc(median_fpr, median_tpr).round(2)}', color=f'C{k}')
    
    #Use the next 2 lines if you want to see the errors
    #plt.fill_between(np.percentile(FPRs[k], 50, axis=0)[:, 0], np.percentile(TPRs[k], 84.13, axis=0)[:, 0],
    #                 np.percentile(TPRs[k], 15.87, axis=0)[:, 0], color=f'C{k}', alpha=0.4)
    
# The false positive rate is plotted on x and the true positive rate on y.
plt.xlabel(r'FPR')
plt.ylabel(r'TPR')
plt.title('ROC Curve')
plt.legend()
# NOTE(review): `modelname` and `train_size` are not assigned in the cells visible in this
# notebook (the split cell passes train_size=0.8 as a literal) — confirm they are defined
# before running, otherwise this call raises NameError.
plt.savefig(
    result_dir + f'NTv3_model{modelname}_n_classes{n_classes}_n_folds{n_splits}_epochs{epochs}_lr{lr}_maxlenght{max_length}_trainsize{train_size}' + 'roc_com_erros.png')
plt.show()

# LIME

Here we will use only the best trained model. With our data it was model 0, so the example uses that one.

In [1]:
# `models` is filled by the training cell or by the model-loading cell of the Testing section;
# run one of them first, otherwise this line raises NameError.
model = models[0]

NameError: name 'models' is not defined

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

In [None]:
#This code will create a LIME function for multilabel data that already preprocess our data

def multi_label_explainer(input_text, labels=sintomas, num_features=None, num_samples=None, bow=True):
    """Show one LIME explanation per requested symptom for a single text.

    The text is cleaned with the notebook's preprocessing patterns, and then, for every
    entry of ``labels``, a binary ('None' vs. the symptom) LIME text explanation is
    rendered in the notebook.

    Relies on names defined by earlier cells: the cleaning patterns and ``remover``,
    ``tokenizer``, ``model``, ``device`` and ``sintomas``.

    Args:
        input_text: raw text to explain (a single string).
        labels: symptoms to explain; each must be an entry of ``sintomas``.
        num_features: number of words LIME reports. Defaults to one per 20 words of the
            cleaned text, limited to the range 1..10.
        num_samples: number of perturbed texts LIME generates. Defaults to 2.5 times the
            number of words of the cleaned text, capped at 1000.
        bow: forwarded to ``LimeTextExplainer``.

    Returns:
        None. The explanations are displayed with ``show_in_notebook``.
    """
    # Same cleaning steps, in the same order, as applied to the training texts. Reusing the
    # notebook-level patterns keeps training and explanation preprocessing in sync.
    for cleaning_pattern in (username, http, rt, www, com, br, hashtag, emojis):
        input_text = remover(input_text, cleaning_pattern)

    # Defaults depend only on the cleaned text, so they are derived once for all labels.
    n_words = len(input_text.split(' '))
    if num_samples is None:
        num_samples = min(int(n_words * 2.5), 1000)
    if num_features is None:
        num_features = min(max(n_words // 20, 1), 10)

    def make_classifier_pipeline(label):
        """Build the probability function LIME calls for one symptom."""
        # Column of the model output that belongs to this symptom. It is looked up in the
        # full training list `sintomas`, not in `labels`, so that a caller-supplied subset
        # of labels still reads the right column.
        label_index = sintomas.index(label)

        def lime_explainer_pipeline(texts):
            """Return an (n_texts, 2) array of [P(not label), P(label)] for ``texts``."""
            encodings = tokenizer.batch_encode_plus(texts, max_length=150, pad_to_max_length=True)

            batch_input_ids = torch.tensor(np.array(encodings['input_ids'])).to(device)  # tokenized and encoded sentences
            batch_attention_masks = torch.tensor(np.array(encodings['attention_mask'])).to(device)  # attention masks

            model.to(device)
            model.eval()

            with torch.no_grad():
                outs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks)
                pred_label = torch.sigmoid(outs[0]).to('cpu').numpy()

            prob_true = pred_label[:, label_index]
            return np.column_stack((1 - prob_true, prob_true))

        return lime_explainer_pipeline

    for label in labels:
        class_names = ['None', label]

        # make a classifier function for the required label
        classifier_fn = make_classifier_pipeline(label)

        explainer = LimeTextExplainer(class_names=class_names, kernel_width=25, bow=bow)
        exp = explainer.explain_instance(input_text, classifier_fn, num_features=num_features, num_samples=num_samples)
        exp.show_in_notebook(text=True, predict_proba=True)

In [None]:
#Here is an example of how to call the function: it explains the cleaned text at index 10,
#reporting 2 words per symptom

multi_label_explainer(texts[10], num_features=2)