# Multi-label Classifier

This notebook implements a multi-label classifier that fine-tunes a BERT model to tell if a sentence contains problematic metaphors

<div hidden>
TODO: add extend data3/data.json with better data in the same format that actually makes sense.
</div>

## Imports and Setup

In [1]:
!pip install transformers -Uqq
!pip install sklearn -Uqq
!pip install datasets -Uqq
!pip install torch -Uqq
!pip install numpy -Uqq
!pip install evaluate -Uqq

In [3]:
import evaluate
import numpy as np
import torch
from datasets import Dataset, load_dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)



In [None]:
MODEL_NAME = "aihype_multi-label-bert"

## Loading Dataset

In [3]:
dataset = load_dataset("json", data_files="data3/data.json", field="data")
dataset

Using custom data configuration default-766ccb02cf791abc
Found cached dataset json (/home/xt0r3/.cache/huggingface/datasets/json/default-766ccb02cf791abc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'agency', 'humanComparison', 'hyperbole', 'historyComparison', 'unjustClaims', 'deepSounding', 'sceptics', 'deEmphasize', 'performanceNumber', 'inscrutable', 'objective'],
        num_rows: 329
    })
})

In [4]:
dataset["train"][0:3]

{'text': ['A new vision of artificial intelligence for the people',
  'The gig workers fighting back against the algorithms',
  'How the AI industry profits from catastrophe'],
 'agency': [False, True, False],
 'humanComparison': [False, True, False],
 'hyperbole': [False, True, True],
 'historyComparison': [False, False, False],
 'unjustClaims': [False, False, False],
 'deepSounding': [False, False, False],
 'sceptics': [False, False, False],
 'deEmphasize': [False, False, False],
 'performanceNumber': [False, False, False],
 'inscrutable': [False, False, False],
 'objective': [False, False, False]}

In [5]:
labels = [label for label in dataset["train"].features.keys() if label not in ["text"]]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}
labels

['agency',
 'humanComparison',
 'hyperbole',
 'historyComparison',
 'unjustClaims',
 'deepSounding',
 'sceptics',
 'deEmphasize',
 'performanceNumber',
 'inscrutable',
 'objective']

## Preprocess Data, Create Train/Test Split

In [6]:
# dataset = dataset.class_encode_column("label")
dataset = dataset["train"].train_test_split(
    test_size=0.2
)  # , stratify_by_column="label")
#
# dataset

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [8]:
encoded_dataset = dataset.map(
    preprocess_data, batched=True, remove_columns=dataset["train"].column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Verify dataset

In [9]:
example = encoded_dataset["train"][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [10]:
tokenizer.decode(example["input_ids"])

'[CLS] How Apple personalizes Siri without hoovering up your data [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [11]:
example["labels"]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [12]:
[id2label[idx] for idx, label in enumerate(example["labels"]) if label == 1.0]

[]

In [13]:
encoded_dataset.set_format("torch")

In [14]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 263
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 66
    })
})

In [15]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [16]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  1731,  7302,  2357,  9534,  2203,  1182,  1443, 16358,  5909,
         1158,  1146,  1240,  2233,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

## Load Pre-Trained Model

In [17]:
# use_fast uses fast tokenizers backed by rust. Remove it if it causes errors
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

### Verify data-model interaction

In [18]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.7275, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.3235, -0.0577,  0.1060, -0.0252,  0.2660, -0.2880, -0.2013,  0.2472,
          0.1995,  0.2474,  0.4223]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

## Define Metrics

In [19]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

## Train the Model

In [20]:
batch_size = 1
metric_name = "f1"

In [21]:
training_args = TrainingArguments(
    MODEL_NAME,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,  # TODO: enable once model seems good
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)

In [23]:
trainer.train()

***** Running training *****
  Num examples = 263
  Num Epochs = 15
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3945
  Number of trainable parameters = 108318731


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.226728,0.0,0.5,0.606061
2,0.217300,0.223214,0.0,0.5,0.606061
3,0.217300,0.209764,0.0,0.5,0.606061
4,0.171400,0.201446,0.040816,0.509399,0.606061
5,0.171400,0.207337,0.150943,0.541272,0.590909
6,0.115000,0.211075,0.21875,0.567999,0.484848
7,0.115000,0.229822,0.196721,0.5586,0.515152
8,0.082600,0.224676,0.142857,0.539066,0.560606
9,0.082600,0.244213,0.2,0.559335,0.545455
10,0.063200,0.255359,0.107143,0.527462,0.545455


***** Running Evaluation *****
  Num examples = 66
  Batch size = 1
Saving model checkpoint to bert-finetuned-multi-label-ai-hype/checkpoint-263
Configuration saved in bert-finetuned-multi-label-ai-hype/checkpoint-263/config.json
Model weights saved in bert-finetuned-multi-label-ai-hype/checkpoint-263/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 66
  Batch size = 1
Saving model checkpoint to bert-finetuned-multi-label-ai-hype/checkpoint-526
Configuration saved in bert-finetuned-multi-label-ai-hype/checkpoint-526/config.json
Model weights saved in bert-finetuned-multi-label-ai-hype/checkpoint-526/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 66
  Batch size = 1
Saving model checkpoint to bert-finetuned-multi-label-ai-hype/checkpoint-789
Configuration saved in bert-finetuned-multi-label-ai-hype/checkpoint-789/config.json
Model weights saved in bert-finetuned-multi-label-ai-hype/checkpoint-789/pytorch_model.bin
***** Running Evaluation *****
  Num 

TrainOutput(global_step=3945, training_loss=0.09761825000680747, metrics={'train_runtime': 303.6426, 'train_samples_per_second': 12.992, 'train_steps_per_second': 12.992, 'total_flos': 259514247317760.0, 'train_loss': 0.09761825000680747, 'epoch': 15.0})

## Upload the Model

In [1]:
# trainer.push_to_hub()

In [4]:
??TrainingArguments

[0;31mInit signature:[0m
[0mTrainingArguments[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0moutput_dir[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moverwrite_output_dir[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_train[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_eval[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_predict[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mevaluation_strategy[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtrainer_utils[0m[0;34m.[0m[0mIntervalStrategy[0m[0;34m,[0m [0mstr[0m[0;34m][0m [0;34m=[0m [0;34m'no'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprediction_loss_only[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0