In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from typing import *
from pathlib import Path
from dataclasses import dataclass
import math

from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.trainer import Trainer
import numpy as np
import pandas as pd
import bigbench.api.results as bb

from lass.log_handling import LogLoader
from lass.datasets import split_task_level, huggingfaceify
import lass
import lass.metrics.pandas

Couldn't find a directory or a metric named 'roc_auc' in this version. It was picked from the master branch on github instead.


In [10]:
# Tunable inputs for this evaluation, named so a reader can find them quickly.
LOG_DIR = Path('../artifacts/logs')
SPLIT_SEED = 42
TEST_FRACTION = 0.2

# Describe which evaluation logs to load. Each `with_*` call narrows the
# selection (task set, model family, model size, number of shots, query type).
# NOTE(review): the exact filtering semantics live in `lass.log_handling.LogLoader`.
loader = (
    LogLoader(logdir=LOG_DIR)
    .with_tasks('paper-full')
    .with_model_families(['BIG-G T=0'])
    .with_model_sizes(['128b'])
    .with_shots([0])
    .with_query_types([bb.MultipleChoiceQuery])
)

# Split into train/test with a fixed seed. Only `test` is evaluated in this
# notebook, hence the underscore on `_train`.
# Presumably the split keeps whole tasks together (going by the helper's name) - confirm.
_train, test = split_task_level(loader, seed=SPLIT_SEED, test_fraction=TEST_FRACTION)


Couldn't find a directory or a metric named 'roc_auc' in this version. It was picked from the master branch on github instead.


In [26]:
# Convert the splits into the dataset object used for tokenization below.
# Only the first entry of `_train` is passed: this notebook evaluates on the
# 'test' split only (see `tokenized_datasets['test']` in the evaluation cell).
# Presumably the one-element slice just satisfies the helper's need for a train
# argument while keeping preprocessing cheap - TODO confirm against `huggingfaceify`.
dataset = huggingfaceify(_train[:1], test)

In [27]:
# The top-level `transformers` package is needed for the
# `transformers.logging.set_verbosity_error()` calls in the evaluation cell.
import transformers
# Set TOKENIZERS_PARALLELISM=true in this kernel's environment.
# NOTE(review): this variable is read by the Hugging Face tokenizers library;
# presumably it is set here to keep parallel tokenization on during `dataset.map` - confirm.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [30]:
@dataclass
class Assessor:
    """A trained assessor checkpoint to be evaluated in this notebook.

    The evaluation loop uses `name` both to look up the matching tokenizer and
    as the row label in the results table, and passes `path` to
    `from_pretrained` to load the model weights.
    """

    # Short architecture identifier, e.g. "bert" or "gpt2".
    name: str
    # Location of the saved checkpoint directory.
    path: Path

# Root directory of the saved assessor checkpoints, relative to this notebook.
BASE = Path("../artifacts/assessors/")

# Restrict transformers' own logging to errors before loading the tokenizers.
transformers.logging.set_verbosity_error()  # type: ignore

# One pretrained tokenizer per assessor architecture, keyed by `Assessor.name`.
tokenizers = {
    key: AutoTokenizer.from_pretrained(hub_id)
    for key, hub_id in [
        ("bert", "bert-base-cased"),
        ("roberta", "roberta-base"),
        ("albert", "albert-base-v2"),
        ("gpt2", "gpt2"),
    ]
}
# Use the EOS token as the padding token for gpt2; the evaluation loop
# tokenizes with padding="max_length", which needs a pad token to be set.
tokenizers["gpt2"].pad_token = tokenizers["gpt2"].eos_token

# Checkpoints to evaluate, in the order they will appear in the results table.
assessors = [
    Assessor(name=arch, path=BASE / checkpoint)
    for arch, checkpoint in [
        ("gpt2", "gpt2-bs8-0sh-task-split/checkpoint-5500"),
        ("bert", "bert-bs32-0sh-task-split/checkpoint-1500"),
        ("roberta", "roberta-bs32-0sh-task-split/checkpoint-1500"),
        ("albert", "albert-bs32-0sh-task-split/checkpoint-1500"),
    ]
]

# Callback handed to `Trainer` so that `predict` reports these metrics.
compute_metrics = lass.metrics.hf.get_metric_computer(
    ["accuracy", "precision", "recall", "f1", "roc_auc", "brier_score"]
)

# One summary row (model name + selected test metrics) per assessor.
results = []
for assessor in assessors:
    # Silenced again on every iteration on purpose.
    # NOTE(review): constructing a `Trainer` appears to reset the transformers
    # log level, so the call before the loop is not sufficient - confirm.
    transformers.logging.set_verbosity_error()  # type: ignore
    model = AutoModelForSequenceClassification.from_pretrained(assessor.path)

    # Each architecture has its own vocabulary, so the dataset is re-tokenized
    # with the tokenizer that matches the current assessor.
    tokenizer = tokenizers[assessor.name]

    def tokenize(examples):
        """Tokenize a batch of examples from their "text" column, padded/truncated to max length."""
        return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors="np")

    tokenized_datasets = dataset.map(tokenize, batched=True)

    # The Trainer is used only as a convenient batched-inference wrapper with
    # metric computation; no training takes place here.
    dummy_trainer = Trainer(model=model, compute_metrics=compute_metrics)

    predictions, labels, metrics = dummy_trainer.predict(tokenized_datasets['test'])  # type: ignore
    print(metrics)
    assert metrics is not None

    # Keep the three headline metrics, dropping the "test_" prefix that
    # `predict` adds to every key.
    metrics = dict((key, metrics[f"test_{key}"]) for key in ["accuracy", "brier_score", "roc_auc"])
    results.append(dict(model=assessor.name, **metrics))

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/wout/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve/mai

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8986
  Batch size = 8


{'test_loss': 0.6059878468513489, 'test_accuracy': 0.7089917649677275, 'test_precision': 0.556, 'test_recall': 0.34614016364283173, 'test_f1': 0.426660819995615, 'test_roc_auc': 0.6477325774826197, 'test_brier_score': 0.20891142745016086, 'test_runtime': 194.7615, 'test_samples_per_second': 46.138, 'test_steps_per_second': 2.886}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8986
  Batch size = 8


{'test_loss': 0.5663542747497559, 'test_accuracy': 0.6871800578677943, 'test_precision': 0.5, 'test_recall': 0.5613660618996799, 'test_f1': 0.5289089994972348, 'test_roc_auc': 0.7332434896452197, 'test_brier_score': 0.19392482998343866, 'test_runtime': 73.5563, 'test_samples_per_second': 122.165, 'test_steps_per_second': 7.64}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8986
  Batch size = 8


{'test_loss': 0.6038299202919006, 'test_accuracy': 0.6031604718450924, 'test_precision': 0.39733478379113407, 'test_recall': 0.5197438633938101, 'test_f1': 0.4503699136868064, 'test_roc_auc': 0.6729864312698667, 'test_brier_score': 0.20940206359910565, 'test_runtime': 78.7742, 'test_samples_per_second': 114.073, 'test_steps_per_second': 7.134}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8986
  Batch size = 8


{'test_loss': 0.5871859788894653, 'test_accuracy': 0.7129979968840419, 'test_precision': 0.5878787878787879, 'test_recall': 0.2760583422269655, 'test_f1': 0.3756959573953038, 'test_roc_auc': 0.6657050597925731, 'test_brier_score': 0.20168135298967985, 'test_runtime': 69.1283, 'test_samples_per_second': 129.99, 'test_steps_per_second': 8.13}


In [31]:
# Baseline: how well does the language model's own confidence predict whether
# it answers correctly? These rows are compared with the assessors above.

# Keep only instances whose `correct` value is exactly 0 or 1. Re-running this
# filter on an already filtered frame is a no-op.
test = test[test['correct'].isin([0.0, 1.0])]

# Confidence = exp(max score) per instance.
# NOTE(review): presumably the scores are log-probabilities over the answer
# options, so exp(max) is the probability of the top choice - confirm.
# A column-wise `map` avoids building a full row Series per instance, which
# `DataFrame.apply(axis=1)` would do.
LM_conf_normalized = test['normalized_scores'].map(lambda scores: math.exp(np.max(scores)))
LM_conf_absolute = test['absolute_scores'].map(lambda scores: math.exp(np.max(scores)))

lm_rows = [
    {
        "model": "LM_normalized",
        **lass.metrics.pandas.confidence_metrics(LM_conf_normalized, test['correct'])  # type: ignore
    },
    {
        "model": "LM_absolute",
        **lass.metrics.pandas.confidence_metrics(LM_conf_absolute, test['correct'])  # type: ignore
    },
]

# Make the cell idempotent: `results` is shared with the evaluation cell, and a
# plain `append` would add a duplicate pair of baseline rows on every re-run.
# Drop baseline rows left by an earlier run, then add the fresh ones. The list
# is updated in place so it stays the same object.
lm_names = {row["model"] for row in lm_rows}
results[:] = [row for row in results if row["model"] not in lm_names] + lm_rows

df = pd.DataFrame(results)
df

Unnamed: 0,model,accuracy,brier_score,roc_auc
0,gpt2,0.708992,0.208911,0.647733
1,bert,0.68718,0.193925,0.733243
2,roberta,0.60316,0.209402,0.672986
3,albert,0.712998,0.201681,0.665705
4,LM_normalized,0.490875,0.372616,0.590341
5,LM_absolute,0.691409,0.270685,0.551438
