In [1]:
import pandas as pd
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm
import os
from datasets import load_dataset
from collections import defaultdict
from datasets import load_metric
from datasets import Dataset, DatasetDict

from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support


# Seed PyTorch's and NumPy's global random number generators with a fixed value.
torch.manual_seed(42)
np.random.seed(42)

In [2]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "google-bert/bert-base-cased"

# Tokenizer for that checkpoint; later cells use it to encode text and build the collator.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Task 3.5: Multilabel classification

In the last part of the assignment, we'll work on the full data. Here, the task becomes a bit more complex. Each piece of text has between 0 and $k$ binary labels associated with it, specifying which of the $k$ values were observed. This type of task setup is called [_multilabel classification_](https://en.wikipedia.org/wiki/Multi-label_classification) where we want to predict multiple labels at the same time. You can contrast this with _multiclass classification_ where we want to predict which class of multiple classes is present, but we only make one prediction.

Ideally, we want to predict all of them at once! For some motivation, if we tried to predict them each individually as we did earlier, we'd need to train separate classifiers for each, which is very computationally expensive. As a second motivation, often there are some shared relationships between labels. When the model gets to train on multilabel data, you can get improved performance when the model learns the correlation/relationships between labels.

However, training a multilabel classifier will require us to modify how we set up the `Trainer` and model. In Task 3.5 you get to see another example of how to train using this new task type.

## Task 3.5.1 Loading and preparing the data

Start by loading the train, dev, and test `DataFrames` for the multilabel files provided with the assignment. These files will have many more columns indicating the presence of different values.

In [3]:
from transformers import DataCollatorWithPadding

# Collator passed to the Trainer below; built from the tokenizer and configured to return PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [4]:
# Paths of the tab-separated multilabel splits.
train_data_path = 'si630-w24-train.multilabel.tsv'
test_data_path = 'si630-w24-test.multilabel.tsv'
dev_data_path = 'si630-w24-dev.multilabel.tsv'

# Read each split into a DataFrame (train, test, dev order).
train_df, test_df, dev_df = [
    pd.read_csv(split_path, sep='\t')
    for split_path in (train_data_path, test_data_path, dev_data_path)
]

# Wrap each DataFrame in a Hugging Face Dataset.
train_dataset, test_dataset, dev_dataset = [
    Dataset.from_pandas(frame) for frame in (train_df, test_df, dev_df)
]

# Bundle the splits; the dev split is stored under the 'validation' key.
dataset_dict = DatasetDict(dict(zip(
    ('train', 'test', 'validation'),
    (train_dataset, test_dataset, dev_dataset),
)))

## Task 3.5.2: Preparing multilabel data

Like in the earlier binary classifier, we need to get a list of labels for our data and a way of mapping them to their index. This time, we'll have more labels though. The major new wrinkle will come when we preprocess the data. Here, we'll need to encode our multilabel ground truth as a binary vector indicating which labels were present.

In [5]:
# Every column after the first two is treated as a label name.
labels = train_df.columns[2:].tolist()
# Forward map: label name -> position in the label list.
label2id = dict(zip(labels, range(len(labels))))
# Reverse map: position -> label name.
id2label = {position: name for name, position in label2id.items()}
labels

['Self-direction: thought',
 'Self-direction: action',
 'Stimulation',
 'Hedonism',
 'Achievement',
 'Power: dominance',
 'Power: resources',
 'Face',
 'Security: personal',
 'Security: societal',
 'Tradition',
 'Conformity: rules',
 'Conformity: interpersonal',
 'Humility',
 'Benevolence: caring',
 'Benevolence: dependability',
 'Universalism: concern',
 'Universalism: nature',
 'Universalism: tolerance',
 'Universalism: objectivity']

In [6]:
def preprocess_data(examples):
    """Tokenize a batch of texts and attach a multi-hot label matrix.

    Args:
        examples: A batch mapping column name -> list of values. Must contain
            a "text" column and one column per entry of the module-level
            ``labels`` list.

    Returns:
        The tokenizer output for the batch with an added "labels" entry: a
        float array of shape ``(len(texts), len(labels))`` where entry
        ``[j, i]`` is 1.0 when example ``j`` has value 1 in column
        ``labels[i]`` and 0.0 otherwise.
    """
    texts = examples["text"]
    # padding=True pads every text to the longest one in this batch so the
    # NumPy output is rectangular; inputs longer than 512 tokens are truncated.
    tokenized_inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="np")

    # Size the matrix from the label list rather than a hard-coded count, so
    # the function keeps working when the label set changes.
    label_matrix = np.zeros((len(texts), len(labels)))
    for column, label in enumerate(labels):
        # Only the value 1 marks a label as present; anything else stays 0.
        label_matrix[:, column] = np.asarray(examples[label]) == 1

    tokenized_inputs["labels"] = label_matrix

    return tokenized_inputs

In [7]:
# Apply preprocess_data to every split; batched=True passes a batch of rows per call.
multilabel_ds = dataset_dict.map(preprocess_data, batched=True)

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

### Exploration time

Let's get a sense of what is going on in the above. If it's helpful, try walking through these steps manually too.

In [8]:
# Tokenize one sentence, padding it out to max_length (128) so the padding is visible.
text = "This is a test of the tokenizer."
encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
# Show the type of the tokenizer's return value and its contents.
print(type(encoding))
print(encoding)

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': [101, 1188, 1110, 170, 2774, 1104, 1103, 22559, 17260, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [9]:
# Take the first preprocessed training example.
example = multilabel_ds['train'][0]

# Decode its token ids back to text to see what the model will actually receive.
print(tokenizer.decode(example['input_ids']))

[CLS] We should ban human cloning because as it will only cause huge issues when you have a bunch of the same humans running around all acting the same. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [10]:
# The label vector for this example: one entry per label, 1.0 where the label is present.
example['labels']

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [11]:
# Map the positions holding 1.0 back to their label names.
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['Security: societal']

## Task 3.5.3 Setting up the model and `TrainingArguments`

Now that the data is ready for us to use, let's create the model and `Trainer`. This time, we'll add a few more arguments when we load our `AutoModel`:

- `problem_type` - we can specify what kind of problem we're going to train based on the pre-trained model. Here, we'll use "multi_label_classification"
- `id2label` - the dictionary we just created from IDs to the label name
- `label2id` - the dictionary we just created from label names to IDs

We'll still need to set the `num_labels` argument too.

In [12]:
# Load the pretrained encoder with a fresh classification head for multilabel prediction.
model_name = "google-bert/bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    # Derive the head size from the label list so it always matches
    # id2label / label2id instead of relying on a hard-coded count.
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training configuration for the multilabel run.
multilabel_training_args = TrainingArguments(
    output_dir="./results_multilabel", 
    overwrite_output_dir=True,  
    learning_rate=2e-5,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8, 
    num_train_epochs=10, 
    # Evaluate on the eval dataset every 50 training steps.
    evaluation_strategy="steps", 
    eval_steps=50, 
    # Checkpoint saving is turned off here; the MyBestModelSaver callback
    # defined in a later cell saves weights instead.
    save_strategy="no",
    do_eval=True,
    logging_dir="./logs_multilabel",
    # Metrics are reported to Weights & Biases.
    report_to=["wandb"],
    # NOTE(review): this seed (12345) differs from the 42 used for the global
    # torch / NumPy seeds at the top of the notebook — confirm that is intended.
    seed=12345
)

In [14]:
from transformers import Trainer, TrainingArguments, TrainerCallback

class MyBestModelSaver(TrainerCallback):
    """Keep a copy of the best-scoring weights seen during training and save it at the end.

    After every evaluation the tracked metric is compared with the best value
    seen so far. When it improves, a snapshot of the model weights is kept in
    memory. When training ends, the best snapshot is written to disk.

    Args:
        model_path: File the best weights are written to with ``torch.save``.
        metric_name: Key looked up in the evaluation metrics dict.
    """

    def __init__(self, model_path="./best_model2", metric_name="eval_f1"):
        self.best_metric = float('-inf')
        self.best_model = None
        self.model_path = model_path
        self.metric_name = metric_name

    def on_evaluate(self, args, state, control, **kwargs):
        """Snapshot the weights when the tracked metric beats the best so far."""
        metrics = kwargs.get('metrics') or {}
        # A missing metric counts as 0, matching the original behavior.
        eval_metric = metrics.get(self.metric_name, 0)
        if eval_metric > self.best_metric:
            self.best_metric = eval_metric
            # state_dict() hands back the live parameter tensors, which the
            # optimizer keeps updating in place. Clone them (on CPU, to avoid
            # holding a second copy on the accelerator) so the snapshot really
            # is the best model and not whatever the weights are at the end.
            self.best_model = {
                name: tensor.detach().cpu().clone()
                for name, tensor in kwargs['model'].state_dict().items()
            }

    def on_train_end(self, args, state, control, **kwargs):
        """Write the best snapshot to ``self.model_path``, if one was captured."""
        if self.best_model is not None:
            model_path = self.model_path
            torch.save(self.best_model, model_path)
            print(f"Best model saved to {model_path} with F1 score: {self.best_metric}")

## Task 3.5.4: Multilabel Evaluate Metric

Our previous `compute_metrics` function used metrics designed for binary prediction. We'll need to update the function slightly here so that we can score our multilabel predictions. Thankfully, the `sklearn` scoring functions _can_ support multilabel predictions, so we won't need to change those, _but_ the default binary averaging no longer applies when each example has several labels, so we'll switch to "micro" averaging, which pools the true/false positives and negatives across all labels before computing the scores.

In [15]:
def compute_multilabel_metrics(eval_pred: EvalPrediction):
    """Compute micro-averaged precision, recall and F1 for multilabel predictions.

    Args:
        eval_pred: Pair of (logits, labels) produced during evaluation. If the
            logits arrive as a tuple of model outputs, the first element is used.

    Returns:
        Dict with the keys 'precision', 'recall' and 'f1'.
    """
    logits, label_ids = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # sigmoid(x) > 0.5 exactly when x > 0, so thresholding the raw logits gives
    # the same predictions as rounding the sigmoid, without the overflow
    # warnings np.exp raises on large-magnitude negative logits.
    predictions = (np.asarray(logits) > 0).astype(float)

    # zero_division=0 keeps the score at 0 when nothing is predicted positive
    # (as happens early in training) without emitting a warning each time.
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_ids, predictions, average='micro', zero_division=0
    )
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [16]:
# Wire the model, data splits, metric function, collator and best-model callback together.
multilabel_trainer = Trainer(
    model=model,
    args=multilabel_training_args,
    train_dataset=multilabel_ds["train"],
    eval_dataset=multilabel_ds["validation"],
    compute_metrics=compute_multilabel_metrics,
    data_collator=data_collator,
    callbacks=[MyBestModelSaver()],
)


In [None]:
# Train the model! Evaluation on the dev split runs periodically as configured in the training arguments.
multilabel_trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myanzhuo[0m ([33myanzhuoteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,No log,0.429205,0.0,0.0,0.0
100,No log,0.410953,0.0,0.0,0.0
150,No log,0.401465,0.0,0.0,0.0
200,No log,0.39424,0.626904,0.116528,0.196526
250,No log,0.385316,0.711849,0.124705,0.212231
300,No log,0.378306,0.707237,0.135241,0.227063
350,No log,0.371848,0.709841,0.15427,0.253456
400,No log,0.370021,0.73354,0.148923,0.247582
450,No log,0.363919,0.722354,0.191068,0.302201
500,0.402100,0.361834,0.685393,0.220632,0.333809


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Evaluate the model on the dev set (the eval_dataset given to the Trainer)
multilabel_trainer.evaluate()

In [None]:
# Run the trained model over the preprocessed test split.
test_predictions = multilabel_trainer.predict(multilabel_ds["test"])

In [None]:
# The prediction output carries the raw logits and the gold multi-hot label
# matrix for the test split; both have one column per label.
new_labels = np.asarray(test_predictions.label_ids)
test_logits = np.asarray(test_predictions.predictions)

# A label is predicted present when its logit is positive (sigmoid > 0.5).
test_pred_labels = (test_logits > 0).astype(float)

print(new_labels.shape)

precision, recall, f1, _ = precision_recall_fscore_support(
    new_labels, test_pred_labels, average='micro', zero_division=0
)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

### Exploration

Let's see what exactly our model can do. Often it's helpful to work through simple examples of inputs and outputs to get a sense of what data and datatypes are flowing through the system.

In [None]:
# First, let's grab a text from the test set to see what the model predicts.
# Indexing the row first avoids loading the whole text column for one example.
text = multilabel_ds['test'][2]['text']
print(text)

# Now let's tokenize the text, truncating to the same 512-token limit used in
# preprocessing so a long text cannot exceed what the model accepts
encoding = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

# We need to move the encoding to the device the model is on
encoding = {k: v.to(multilabel_trainer.model.device) for k,v in encoding.items()}

# Now let's get the model's predictions; no gradients are needed for inference
with torch.no_grad():
    outputs = multilabel_trainer.model(**encoding)
print(outputs)

In [None]:
# Note that the outputs are a specific type of object that has the logits
# Pull the raw logits out of the model's output object and inspect their shape and values.
logits = outputs.logits
print(logits.shape)
print(logits)

In [None]:
# Squash the logits into per-label probabilities with a sigmoid
# (each label gets its own independent probability).
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())

# Rounding each probability gives a 0/1 decision per label:
# 1 means the model predicts that value label is present.
predictions = probs.detach().numpy().round()

# Translate the positions predicted as present into label names
# via the id2label dictionary.
predicted_labels = [
    id2label[position]
    for position, is_present in enumerate(predictions)
    if is_present == 1.0
]
print(predicted_labels)