In [7]:
import inspect
from json import dumps

import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers
from tqdm.autonotebook import tqdm

import loader
from models import BinningCalibrator
import utils

# Fraction of all samples held out for testing.
split_ratio_test = 0.3
# Fraction reserved for post-processing.
split_ratio_postproc = 0.1  # among all training data

# Seeds for the post-process/test split and for the post-processing step.
# They affect neither the pre-training data nor the randomness of
# pre-training, i.e., the pre-trained predictor is assumed to be fixed.
# Results will be aggregated over these seeds.
seeds = range(33, 38)

# Seed for the randomness during pre-training (fixed)
seed_pretrain = 33

# Calibration settings
split_ratio_calib = 0.3
n_bins_calib = 70

max_workers = 4
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_dir = "data/biasbios"

In [14]:
## Load the BiasBios dataset (attribute-aware and attribute-blind variants)

# Attribute-aware variant: the loader also returns the label/group arrays and
# their display names.
biasbios_aware = loader.load_biasbios(data_dir, add_sensitive_attribute=True)
dataset, labels, label_names, groups, group_names = biasbios_aware

# Attribute-blind variant: only the dataset itself (first element) is kept.
dataset_u = loader.load_biasbios(data_dir, add_sensitive_attribute=False)[0]

# Per-(title, gender) summary table.
dataset_stats_table = loader.dataset_stats(dataset['title'], label_names,
                                           dataset['gender'], group_names)
display(dataset_stats_table)

n_classes = len(label_names)
n_groups = len(group_names)

# Show the same record from both variants (index reuses `seed_pretrain`).
example_aware = dumps(dataset[seed_pretrain], indent=2)
print('example from attribute-aware dataset:', example_aware)
example_blind = dumps(dataset_u[seed_pretrain], indent=2)
print('example from attribute-blind dataset:', example_blind)

Group,female,male
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
accountant,2081,3571
architect,2398,7715
attorney,12494,20113
chiropractor,690,1908
comedian,592,2207
composer,918,4682
dentist,5153,9326
dietitian,3689,289
dj,211,1274
filmmaker,2310,4699


labels = [26 21  2 ... 18 15 13]
example from attribute-aware dataset: {
  "bio": "Male. He produced scores of films including such as al-Dhareeh (the shrine), 1976, winner of the Cinema Institute Films\u2019 Award at the Documentary and Short Films Festival, Egypt, 1977; as well as the Kelibia Festival Award, Tunisia, 1978; al-Mahatta (The Station), winner of a major award at Oberhausen Short Film Festival, Germany, 1989; the EU Award at FESPACO Festival, Burkina Faso, 1990; The Silver Sword Award at Damascus festival, 1990; and The Silver Tanit Award, Carthage festival, Tunisia, 1991. Eltayeb has served as head of the Sudanese Film group for several terms and as secretary of the Sudanese Film club. He has written numerous articles on cinema, published in major Sudanese newspapers. He is currently working on a long fiction film, al-Siraj wal-attama (The Lantern and Darkness).",
  "title": 9,
  "gender": 1
}
example from attribute-blind dataset: {
  "bio": "He produced scores of films 

In [9]:
## Split data into (test + post-processing) and pre-training sets

# Fraction of all samples used for pre-training; the remainder is shared by
# the test and post-processing sets.  Computed once so both splits below are
# guaranteed to use the same ratio.
split_ratio_pretrain = (1 - split_ratio_test) * (1 - split_ratio_postproc)

# NOTE: the pre-training set is taken from the 'test' side of the split, and
# the (test + post-processing) pool from the 'train' side.
split_dataset = dataset.train_test_split(
    test_size=split_ratio_pretrain,
    seed=seed_pretrain,
)
dataset_ = split_dataset['train']
dataset_pretrain = split_dataset['test']

# Encode the joint (A, Y) labels by flattening: label = a * n_classes + y.
# Guarded so that this cell can be re-run without failing on an already
# existing 'labels_ay' column.
if 'labels_ay' not in dataset_u.column_names:
  dataset_u = dataset_u.add_column(
      'labels_ay',
      np.array(dataset_u['gender']) * n_classes + np.array(dataset_u['title']))

# Same ratio and seed as above for the attribute-blind dataset.
split_dataset_u = dataset_u.train_test_split(
    test_size=split_ratio_pretrain,
    seed=seed_pretrain,
)
dataset_u_ = split_dataset_u['train']
dataset_pretrain_u = split_dataset_u['test']

labels_ = np.array(dataset_['title'])
groups_ = np.array(dataset_['gender'])

n_samples = len(dataset)
n_test = int(n_samples * split_ratio_test)

In [10]:
# Checkpoint identifier passed to the `from_pretrained` calls for the
# tokenizer and the sequence classifiers in the cells below.
model_name = "bert-base-uncased"

In [11]:
# Tokenizer loaded for the `model_name` checkpoint; used by
# `tokenize_function` and by the padding data collator.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
  """Tokenize the "bio" field of `examples` with the module-level tokenizer.

  Sequences are truncated to `tokenizer.model_max_length`; no padding is
  applied here.

  Args:
    examples: Mapping with a "bio" entry holding the text(s) to tokenize.

  Returns:
    The tokenizer's output for the given bios.
  """
  return tokenizer(
      examples["bio"],
      truncation=True,
      max_length=tokenizer.model_max_length,
      padding=False,
  )


def _run_tokenizer(ds):
  """Tokenize `ds` in batches and drop its raw 'bio' text column."""
  return ds.map(
      tokenize_function,
      batched=True,
      remove_columns=['bio'],
      desc="Running tokenizer",
  )


# Attribute-aware splits
tokenized_dataset_ = _run_tokenizer(dataset_)
tokenized_dataset_pretrain = _run_tokenizer(dataset_pretrain)
# Attribute-blind splits
tokenized_dataset_u_ = _run_tokenizer(dataset_u_)
tokenized_dataset_pretrain_u = _run_tokenizer(dataset_pretrain_u)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Running tokenizer:   0%|          | 0/145566 [00:00<?, ? examples/s]

Running tokenizer:   0%|          | 0/247857 [00:00<?, ? examples/s]

Running tokenizer:   0%|          | 0/145566 [00:00<?, ? examples/s]

Running tokenizer:   0%|          | 0/247857 [00:00<?, ? examples/s]

In [12]:
# Collate function shared by all DataLoaders below.  Tokenization was done
# with padding=False, so padding is deferred to this collator.
data_collator = transformers.DataCollatorWithPadding(tokenizer)


def train(model, dataset_train, dataset_test, label_col_name, batch_size,
          n_epochs, lr, warmup_ratio, weight_decay, max_grad_norm):
  """Fine-tune `model` with AdamW and print held-out loss/accuracy per epoch.

  The model is updated in place; nothing is returned.  Batches are moved to
  the module-level `device`, so the model must already be on that device.

  Args:
    model: Classifier whose forward pass returns an object with `.logits`.
    dataset_train: Tokenized training set; must contain `label_col_name`.
    dataset_test: Tokenized evaluation set; must contain `label_col_name` and
      a 'title' column (accuracy is always measured against 'title').
    label_col_name: Name of the column holding the training targets.
    batch_size: Batch size for both training and evaluation.
    n_epochs: Number of passes over `dataset_train`.
    lr: Peak learning rate.
    warmup_ratio: Fraction of all optimizer steps used for linear warm-up.
    weight_decay: Weight decay applied to all parameters except biases and
      LayerNorm weights.
    max_grad_norm: Gradient-norm clipping threshold.
  """

  dataloader_train = torch.utils.data.DataLoader(
      dataset_train,
      shuffle=True,
      collate_fn=data_collator,
      batch_size=batch_size,
  )
  dataloader_test = torch.utils.data.DataLoader(
      dataset_test,
      collate_fn=data_collator,
      batch_size=batch_size,
  )

  # Biases and LayerNorm weights are excluded from weight decay.
  no_decay = ["bias", "LayerNorm.weight"]
  optimizer_grouped_parameters = [
      {
          "params": [
              p for n, p in model.named_parameters()
              if not any(nd in n for nd in no_decay)
          ],
          "weight_decay": weight_decay,
      },
      {
          "params": [
              p for n, p in model.named_parameters()
              if any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.0
      },
  ]
  optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
  n_training_steps = n_epochs * len(dataloader_train)
  scheduler = transformers.get_linear_schedule_with_warmup(
      optimizer,
      # The scheduler API expects an integer number of steps.
      num_warmup_steps=int(warmup_ratio * n_training_steps),
      num_training_steps=n_training_steps)

  # Only forward() parameters may be passed to the model; label/group columns
  # in the batch are filtered out.  `inspect.signature` lists parameters only
  # (not locals, unlike `__code__.co_varnames`) and sees through decorators
  # that set `__wrapped__`.
  model_input_args = set(inspect.signature(model.forward).parameters)
  loss_fn = torch.nn.CrossEntropyLoss()

  for epoch in range(n_epochs):

    model.train()
    for batch in tqdm(dataloader_train, desc=f"train epoch {epoch+1}"):
      batch = {k: v.to(device) for k, v in batch.items()}
      optimizer.zero_grad()
      outputs = model(**{
          k: v for k, v in batch.items() if k in model_input_args
      })
      loss = loss_fn(outputs.logits, batch[label_col_name])
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
      optimizer.step()
      scheduler.step()

    model.eval()
    with torch.no_grad():
      test_loss = 0.0
      n_correct = 0
      n_seen = 0
      for batch in tqdm(dataloader_test, desc=f"test {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**{
            k: v for k, v in batch.items() if k in model_input_args
        })
        logits = outputs.logits
        targets = batch[label_col_name]
        n_batch = len(targets)
        # loss_fn returns the batch mean; weight it by the batch size so the
        # final division by the sample count yields a true per-sample mean.
        test_loss += loss_fn(logits, targets).item() * n_batch
        # When the logits range over flattened (group, class) labels, sum the
        # probabilities over groups to obtain class probabilities; with plain
        # class logits this reshape/sum leaves them unchanged.
        probas_y = logits.softmax(dim=1).reshape(n_batch, -1,
                                                 n_classes).sum(dim=1)
        n_correct += (probas_y.argmax(dim=1) == batch['title']).sum().item()
        n_seen += n_batch
      # max(..., 1) avoids a ZeroDivisionError on an empty evaluation set.
      test_loss /= max(n_seen, 1)
      test_acc = n_correct / max(n_seen, 1)
      print(
          f"epoch {epoch+1}/{n_epochs}: loss={test_loss:.4f}, acc={test_acc:.4f}"
      )


def predict_probas(model, dataset, batch_size):
  """Run `model` over `dataset` and return softmax probabilities.

  Batches are moved to the module-level `device`, so the model must already
  be on that device.  The dataset is iterated in order (no shuffling).

  Args:
    model: Classifier whose forward pass returns an object with `.logits`.
    dataset: Tokenized dataset to run inference on.
    batch_size: Number of examples per forward pass.

  Returns:
    A NumPy array of softmax outputs, one row per example, concatenated along
    axis 0 in dataset order.
  """
  dataloader = torch.utils.data.DataLoader(
      dataset,
      collate_fn=data_collator,
      batch_size=batch_size,
  )
  # Keep only the batch entries that forward() accepts.  `inspect.signature`
  # lists parameters only (not locals, unlike `__code__.co_varnames`) and sees
  # through decorators that set `__wrapped__`.
  model_input_args = set(inspect.signature(model.forward).parameters)
  model.eval()
  probas = []
  with torch.no_grad():
    for batch in tqdm(dataloader, desc="inference"):
      batch = {
          k: v.to(device) for k, v in batch.items() if k in model_input_args
      }
      outputs = model(**batch)
      probas.append(outputs.logits.softmax(dim=1).cpu().numpy())
  return np.concatenate(probas, axis=0)

In [13]:
## (Pre-)train predictors

batch_size = 32
n_epochs = 3
lr = 2e-5
warmup_ratio = 0.1
weight_decay = 0.01
max_grad_norm = 1.0

# Fine-tuning of the attribute-blind predictor is switched off by default
# (it was previously disabled by wrapping the call in a string literal, which
# the notebook then echoed as cell output).  Set to True to train `model_u`;
# while False, its classification head stays freshly initialized.
train_attribute_blind = False

# Train attribute-aware p(Y | X) predictor (in this dataset variant the bio
# text itself carries the group, e.g. a leading "Male.")
transformers.set_seed(seed_pretrain)
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=n_classes).to(device)
train(
    model,
    tokenized_dataset_pretrain,
    # Progress is monitored on a 10% sample of the (test + post-processing)
    # pool.
    tokenized_dataset_.train_test_split(test_size=0.1,
                                        seed=seed_pretrain)['test'],
    'title',
    batch_size,
    n_epochs,
    lr,
    warmup_ratio,
    weight_decay,
    max_grad_norm,
)

# Train attribute-blind p(A, Y | X) predictor (one output per (group, class)
# pair)
transformers.set_seed(seed_pretrain)
model_u = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=n_groups * n_classes).to(device)

if train_attribute_blind:
  train(
      model_u,
      tokenized_dataset_pretrain_u,
      tokenized_dataset_u_.train_test_split(test_size=0.1,
                                            seed=seed_pretrain)['test'],
      'labels_ay',
      batch_size,
      n_epochs,
      lr,
      warmup_ratio,
      weight_decay,
      max_grad_norm,
  )



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train epoch 1:   0%|          | 0/7746 [00:00<?, ?it/s]

KeyboardInterrupt: 