In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os
#import json
import torch

import time

from tqdm import tqdm

In [3]:
###setting directory
data_dir="data"
model_dir="models"
output_dir="output"
###

# Finetune model: load dataset

In [4]:
#change
from datasets import Dataset
#PVE_input = Dataset.from_pandas(pd.read_csv('data\PVE_covid_annotation_v0_1.csv', encoding='latin1',index_col=0))
PVE_input =pd.read_excel('data\PVE_covid_annotation_v4_annotated.xlsx', index_col=0)
train_set=Dataset.from_pandas(PVE_input[(PVE_input['AL_batch']>=0) & (PVE_input['AL_batch']<=4)])
test_set=Dataset.from_pandas(PVE_input[PVE_input['BL_batch']==-1])

In [5]:
train_set

Dataset({
    features: ['opinion', 'project', 'random_order', 'BL_batch', 'AL_batch', 'Autonomy_for_companies', 'Autonomy_for_close_contact_professions', 'Quality_of_life_for_elderly', 'Family_value', 'Feasibility', 'Acceptance_of_exception', 'The_need_of_schooling', 'Working_safety', 'Well-being', 'Reduce_adverse_effect', 'step_0_margin', 'step_0_rank', 'step_1_margin', 'step_1_rank', 'step_2_margin', 'step_2_rank', 'step_3_margin', 'step_3_rank', 'step_4_margin', 'step_4_rank', 'step_5_margin', 'step_5_rank', 'step_6_margin', 'step_6_rank', 'step_7_margin', 'step_7_rank', 'step_8_margin', 'step_8_rank', 'step_9_margin', 'step_9_rank', '__index_level_0__'],
    num_rows: 250
})

In [6]:
labels = ['Autonomy_for_companies',
 'Autonomy_for_close_contact_professions',
 'Quality_of_life_for_elderly',
 'Family_value',
 'Feasibility',
 'Acceptance_of_exception',
 'The_need_of_schooling',
 'Working_safety',
 'Well-being',
 'Reduce_adverse_effect']

In [7]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [8]:
label2id 

{'Autonomy_for_companies': 0,
 'Autonomy_for_close_contact_professions': 1,
 'Quality_of_life_for_elderly': 2,
 'Family_value': 3,
 'Feasibility': 4,
 'Acceptance_of_exception': 5,
 'The_need_of_schooling': 6,
 'Working_safety': 7,
 'Well-being': 8,
 'Reduce_adverse_effect': 9}

# Preprocess data

In [9]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    # take a batch of texts
    #text = examples["Tweet"]
    # encode them
    encoding = tokenizer(examples['project'], examples['opinion'], truncation=True)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(examples['opinion']), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()
  
    return encoding


In [10]:
encoded_train_set = train_set.map(preprocess_data, batched=True, remove_columns=train_set.column_names)
encoded_test_set = test_set.map(preprocess_data, batched=True, remove_columns=test_set.column_names)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
example =encoded_train_set[0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [12]:
tokenizer.decode(example['input_ids'])

'[CLS] nursing homes allow visitors again [SEP] loneliness in the last phase of life is inhumane and also leads to premature death. [SEP]'

In [13]:
example['labels']

[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]

In [14]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['Quality_of_life_for_elderly', 'Well-being']

In [15]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    encoded_train_set.set_format("torch", device='cuda:0')
    encoded_test_set.set_format("torch", device='cuda:0')
else:
    encoded_train_set.set_format("torch")
    encoded_test_set.set_format("torch")
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce MX550


# Define model

In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('./models/bert_train_level1/' , 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id, ignore_mismatched_sizes=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/bert_train_level1/ and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([54, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([54]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    model.to('cuda')
else:    
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce MX550


# Train the model

In [18]:
batch_size = 4
#batch_size = 8
metric_name = "f1"

In [19]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    #evaluation_strategy = "epoch",
    #save_strategy = "epoch",
    evaluation_strategy = "steps",
    save_strategy = "steps",
    learning_rate=2e-5,
    #learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #num_train_epochs=40,
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=100,
    #push_to_hub=True,
)

In [20]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [21]:
encoded_train_set[0]['labels'].type()

'torch.cuda.FloatTensor'

In [22]:
encoded_train_set['input_ids'][0]

tensor([  101,  8329,  5014,  3499,  5731,  2153,   102, 20334,  1999,  1996,
         2197,  4403,  1997,  2166,  2003, 29582,  2063,  1998,  2036,  5260,
         2000, 21371,  2331,  1012,   102], device='cuda:0')

In [23]:
encoded_train_set['input_ids'][0].device

device(type='cuda', index=0)

In [24]:
#forward pass
outputs = model(input_ids=encoded_train_set['input_ids'][0].unsqueeze(0), labels=encoded_train_set['labels'][0].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.6203, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-1.0278, -0.5184,  0.9243,  0.2122,  0.5602, -0.2516, -1.2529,  0.0315,
         -0.3269,  0.3327]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [25]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_set,
    eval_dataset=encoded_test_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [26]:
trainer.train()

***** Running training *****
  Num examples = 250
  Num Epochs = 30
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1890
  Number of trainable parameters = 109489930
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
100,0.4402,0.316952,0.490476,0.664799,0.24
200,0.3004,0.261038,0.70632,0.797338,0.38
300,0.226,0.224674,0.703008,0.793338,0.375
400,0.1675,0.204873,0.720887,0.806072,0.425
500,0.1332,0.193194,0.753472,0.838063,0.43
600,0.1057,0.193385,0.756757,0.846805,0.44
700,0.0882,0.181771,0.782161,0.857159,0.485
800,0.0733,0.176659,0.77265,0.852721,0.47
900,0.0664,0.179676,0.769759,0.849759,0.48
1000,0.0595,0.177302,0.763203,0.848282,0.45


***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
Saving model checkpoint to bert-finetuned-sem_eval-english\checkpoint-500
Configuration saved in bert-finetuned-sem_eval-english\checkpoint-500\config.json
Model weights saved in bert-finetuned-sem_eval-english\checkpoint-500\pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english\checkpoint-500\tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
****

TrainOutput(global_step=1890, training_loss=0.1077519970596152, metrics={'train_runtime': 2714.5448, 'train_samples_per_second': 2.763, 'train_steps_per_second': 0.696, 'total_flos': 357272918267328.0, 'train_loss': 0.1077519970596152, 'epoch': 30.0})

# Evaluate

In [27]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 4


{'eval_loss': 0.17393480241298676,
 'eval_f1': 0.7787307032590052,
 'eval_roc_auc': 0.8552354877624092,
 'eval_accuracy': 0.47,
 'eval_runtime': 14.0656,
 'eval_samples_per_second': 14.219,
 'eval_steps_per_second': 3.555,
 'epoch': 30.0}

# Inference

In [28]:
test_pred=trainer.predict(encoded_test_set).predictions
test_pred = torch.from_numpy(test_pred)

***** Running Prediction *****
  Num examples = 200
  Batch size = 4


In [29]:
pd.DataFrame(1*(test_pred.sigmoid()>0.5)).sum(axis=0)

0    67
1    42
2    15
3    17
4     5
5    26
6     2
7     3
8    67
9    32
dtype: int64

In [30]:
from sklearn.metrics import  classification_report

print(classification_report(PVE_input[PVE_input['BL_batch']==-1].iloc[:, 5:15],np.matrix(1*(test_pred.sigmoid()>0.5))))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93        64
           1       0.95      0.98      0.96        41
           2       1.00      0.71      0.83        21
           3       0.59      0.71      0.65        14
           4       0.80      0.29      0.42        14
           5       0.73      0.58      0.64        33
           6       0.00      0.00      0.00         4
           7       1.00      0.19      0.32        16
           8       0.78      0.78      0.78        67
           9       0.72      0.70      0.71        33

   micro avg       0.82      0.74      0.78       307
   macro avg       0.75      0.59      0.62       307
weighted avg       0.83      0.74      0.76       307
 samples avg       0.74      0.68      0.69       307



In [31]:
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(PVE_input[PVE_input['BL_batch']==-1].iloc[:, 5:15],np.matrix(1*(test_pred.sigmoid()>0.5)))

array([[[130,   6],
        [  3,  61]],

       [[157,   2],
        [  1,  40]],

       [[179,   0],
        [  6,  15]],

       [[179,   7],
        [  4,  10]],

       [[185,   1],
        [ 10,   4]],

       [[160,   7],
        [ 14,  19]],

       [[194,   2],
        [  4,   0]],

       [[184,   0],
        [ 13,   3]],

       [[118,  15],
        [ 15,  52]],

       [[158,   9],
        [ 10,  23]]], dtype=int64)

In [32]:
model.save_pretrained('models')

Configuration saved in models\config.json
Model weights saved in models\pytorch_model.bin


# Select cases for active sampling

In [33]:
PVE_input_selection=PVE_input[np.isnan(PVE_input['AL_batch'])]
selection_set=Dataset.from_pandas(PVE_input_selection)
#selection_set=Dataset.from_pandas(PVE_input[np.isnan(PVE_input['AL_batch'])][0:100])

In [34]:
selection_set

Dataset({
    features: ['opinion', 'project', 'random_order', 'BL_batch', 'AL_batch', 'Autonomy_for_companies', 'Autonomy_for_close_contact_professions', 'Quality_of_life_for_elderly', 'Family_value', 'Feasibility', 'Acceptance_of_exception', 'The_need_of_schooling', 'Working_safety', 'Well-being', 'Reduce_adverse_effect', 'step_0_margin', 'step_0_rank', 'step_1_margin', 'step_1_rank', 'step_2_margin', 'step_2_rank', 'step_3_margin', 'step_3_rank', 'step_4_margin', 'step_4_rank', 'step_5_margin', 'step_5_rank', 'step_6_margin', 'step_6_rank', 'step_7_margin', 'step_7_rank', 'step_8_margin', 'step_8_rank', 'step_9_margin', 'step_9_rank', '__index_level_0__'],
    num_rows: 59011
})

In [35]:
encoded_selection_set = selection_set.map(preprocess_data, batched=True, remove_columns=selection_set.column_names)

Map:   0%|          | 0/59011 [00:00<?, ? examples/s]

In [36]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    encoded_selection_set.set_format("torch", device='cuda:0')
else:
    encoded_selection_set.set_format("torch")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce MX550


In [37]:
encoded_selection_set=encoded_selection_set.remove_columns(['labels'])

In [38]:
selection_pred=trainer.predict(encoded_selection_set).predictions
selection_pred = torch.from_numpy(selection_pred)

***** Running Prediction *****
  Num examples = 59011
  Batch size = 4


In [39]:
#pd.DataFrame(1*(selection_pred.sigmoid()>0.5)).sum(axis=0)

In [40]:
#pd.DataFrame(abs(selection_pred.sigmoid()-0.5)).min(axis=1)

In [41]:
#pd.DataFrame(abs(selection_pred.sigmoid()-0.5)).min(axis=1).rank()

In [42]:
PVE_input_selection['margin']=list(pd.DataFrame(abs(selection_pred.sigmoid()-0.5)).min(axis=1))
PVE_input_selection['margin_rank']=list(pd.DataFrame(abs(selection_pred.sigmoid()-0.5)).min(axis=1).rank())

In [43]:
PVE_input_selection.to_excel('selected_case.xlsx')

In [44]:
prob=pd.DataFrame(selection_pred.sigmoid())

In [45]:
prob.to_excel('selected_case_prob.xlsx')