# Test Assessor Generalization w.r.t. distribution shift

We select the best (or worst) performing tasks for the training set, and the reverse for the test set.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to library code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from typing import *
from pprint import pprint

import bigbench.api.results as bb
import numpy as np

from lass.log_handling import LogLoader, TaskLog
from lass.datasets import analyse, merge, huggingfaceify, to_dataframe

In [3]:
# Narrow the logs down to a single evaluation setting: the 'paper-full' task
# selection, the 128b 'BIG-G T=0' model, zero-shot prompts, and
# multiple-choice queries only.
loader = LogLoader(logdir=Path('../artifacts/logs'))
loader = loader.with_tasks('paper-full')
loader = loader.with_model_families(['BIG-G T=0'])
loader = loader.with_model_sizes(['128b'])
loader = loader.with_shots([0])
loader = loader.with_query_types([bb.MultipleChoiceQuery])

# Flatten the selected logs into one DataFrame
# (presumably one row per logged instance — TODO confirm in lass.datasets).
data = to_dataframe(loader)

Rank the tasks by accuracy and split them: the best-performing 80% of tasks form the training set, and the remaining (worst-performing) 20% form the test set.

In [4]:

# Fraction of tasks (ranked by accuracy, best first) assigned to the training set.
TRAIN_TASK_FRACTION = 0.8

# Mean correctness per task, ordered from the best to the worst performing task.
accs = (data
    .groupby('task', as_index=False).agg(acc=('correct', 'mean')) # type: ignore
    .sort_values('acc', ascending=False))

# Split the ranked task names by position. Plain `.iloc` slicing replaces
# `np.split`, which on a pandas Series goes through the `swapaxes` path that
# recent pandas versions deprecate.
n_train_tasks = int(len(accs) * TRAIN_TASK_FRACTION)
train_tasks = accs['task'].iloc[:n_train_tasks]
test_tasks = accs['task'].iloc[n_train_tasks:]
assert set(train_tasks).isdisjoint(test_tasks), "a task ended up in both splits"

# Instances follow their task, so no task is shared between train and test.
train = data[data['task'].isin(train_tasks)]
test = data[data['task'].isin(test_tasks)]

# Side-by-side summary of both splits; the printed stats include the number of
# tasks and instances per split, so the sizes need no separate display.
stats = merge(analyse(train), analyse(test), 'train', 'test')
del stats['task_names'] # Can't read anything anymore otherwise
pprint(stats)
train.head(1)

{'metrics': {'conf-absolute': {'roc_auc': {'test': 0.48721947856416836,
                                           'train': 0.496735045546435}},
             'conf-normalized': {'roc_auc': {'test': 0.4934099804955599,
                                             'train': 0.5874931359228416}},
             'task-acc': {'test': 0.1310840005083238,
                          'train': 0.4182858596134283}},
 'stats': {'n_instances': {'test': 15738, 'train': 39320},
           'n_instances_nonbinary': {'test': 0, 'train': 373},
           'n_tasks': {'test': 24, 'train': 94}}}


Unnamed: 0,input,targets,scores,target_values,correct,absolute_scores,normalized_scores,metrics,task,shots
0,"In what follows, we provide short narratives, ...","[Seek and you shall find, It's better to light...","[-17.807415008544922, -18.298959732055664, -12...","{'April showers bring forth May flowers': 1, '...",0.0,"[-17.807415008544922, -18.298959732055664, -12...","[-5.244009971618652, -5.7355546951293945, -0.0...",{'calibration_multiple_choice_brier_score': 0....,abstract_narrative_understanding,0


In [5]:
# Set CUDA_DEVICE_ORDER to PCI_BUS_ID and expose only device 0 to this kernel.
# NOTE(review): these are set before torch is imported below — presumably so
# they are in place before CUDA is initialised; keep this order.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

import torch

# Sanity check: number of GPUs torch can see (1 in the recorded run).
print(torch.cuda.device_count())

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


1


In [6]:
# Convert the train/test frames into a dataset object keyed by split
# (presumably a Hugging Face DatasetDict — TODO confirm in lass.datasets).
dataset = huggingfaceify(train, test)
# Peek at one training record; the recorded output shows 'text' and 'label' fields.
dataset['train'][0]

{'text': "In what follows, we provide short narratives, each of which illustrates a common proverb. \nNarrative: Carla was having trouble juggling college, working at the diner and being a mother. She never had a day off and was burnt out. Her friend told her that her hard work would pay off one day and to keep going! Carla's friend was right; after a few more years of hard work, Carla graduated school and was able to get a better job. She was able to take a vacation and become overall successful.\nThis narrative is a good illustration of the following proverb: ",
 'label': 0}

In [7]:
from transformers import AutoTokenizer
# Tokenizer for the assessor; loaded from the same "albert-base-v2" checkpoint
# name that the model cell further down uses.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
# GPT-2 alternative kept for reference (it reuses the EOS token as pad token):
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the global ``tokenizer``.

    The function is passed to ``dataset.map(..., batched=True)``, so
    ``examples["text"]`` is expected to be a batch of strings (assumption
    based on that call site). The tokenizer is called with
    ``padding="max_length"`` and ``truncation=True`` and its result is
    returned unchanged.
    """
    # Settings tried for other backbones (not active here):
    #   longformer: max_length=1024; xlnet: max_length=2048;
    #   gpt-2: return_tensors="np"; also return_tensors="pt".
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to the dataset in batches; the recorded output shows
# one progress bar per split.
tokenized_datasets = dataset.map(tokenize_function, batched=True)



  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

In [8]:
# Shuffle both splits with the same fixed seed.
# For a quick smoke test, append `.select(range(50))` to either line.
shuffle_seed = 42
train_dataset = tokenized_datasets["train"].shuffle(seed=shuffle_seed)
eval_dataset = tokenized_datasets["test"].shuffle(seed=shuffle_seed)
len(train_dataset), len(eval_dataset)

(39320, 15738)

In [9]:
import wandb
# Weights & Biases configuration, set before the Trainer is created further down.
# WANDB_PROJECT selects the project name; with WANDB_LOG_MODEL=true the recorded
# run uploads a model at the end of training.
# NOTE(review): TOKENIZERS_PARALLELISM is set after the tokenization cell has
# already run — confirm that is intended.
%env WANDB_PROJECT=lass
%env WANDB_LOG_MODEL=true
%env TOKENIZERS_PARALLELISM=true
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


env: WANDB_PROJECT=lass
env: WANDB_LOG_MODEL=true
env: TOKENIZERS_PARALLELISM=true


[34m[1mwandb[0m: Currently logged in as: [33mwschella[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BertModel, BertConfig
import numpy as np
import scipy
import torch
import lass, lass.metrics.hf

# Assessor model: the "albert-base-v2" checkpoint with a sequence-classification
# head. The recorded load messages show the classifier weights are newly
# initialised, so the head is trained from scratch by this notebook.
# The commented-out lines are alternative backbones / earlier checkpoints kept
# for reference.
# model = BertModel(BertConfig.from_pretrained("bert-base-cased"))
model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2")
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=2)
# model.config.pad_token_id = model.config.eos_token_id
# model = AutoModelForSequenceClassification.from_pretrained("./test_trainer/checkpoint-13500", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("../artifacts/assessors/bert-bs32/checkpoint-3000", num_labels=2)

# The same identifier names the checkpoint directory and the W&B run.
# NOTE(review): the name contains "wd" although weight_decay is commented out
# below — confirm the name still describes this run.
run_id = "albert-bs32-0sh-wd-task-split-DS"
batch_size = 32

# NOTE(review): evaluation runs once per epoch, but no save strategy is set; the
# recorded log shows checkpoints every 500 steps, which do not line up with the
# evaluated epochs.
training_args = TrainingArguments(
    output_dir=run_id,
    run_name=run_id,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    report_to="wandb",
    num_train_epochs=10,
    # weight_decay=0.05,
)

# Metrics reported at every evaluation pass; they match the metric columns of
# the recorded training table.
metric_names = ["accuracy", "precision", "recall", "f1", "roc_auc", "brier_score"]
compute_metrics = lass.metrics.hf.get_metric_computer(metric_names)

# The held-out task split serves as the evaluation set, so the per-epoch
# metrics are measured on tasks that were not used for training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tune the assessor; the per-epoch evaluation metrics appear in the table
# of the recorded output.
trainer.train()

# Close the W&B run once training has returned; the recorded output ends with
# the run history and summary.
wandb.finish()

The following columns in the training set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running training *****


  Num examples = 39320


  Num Epochs = 10


  Instantaneous batch size per device = 32


  Total train batch size (w. parallel, distributed & accumulation) = 32


  Gradient Accumulation steps = 1


  Total optimization steps = 12290


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Brier Score
1,0.6861,0.514936,0.868916,0.0,0.0,0.0,0.542905,0.163196
2,0.6874,0.558437,0.868916,0.0,0.0,0.0,0.569341,0.183564
3,0.6852,0.472194,0.868916,0.5,0.000485,0.000969,0.502613,0.144373
4,0.6653,0.519402,0.835748,0.100917,0.031992,0.048583,0.562973,0.165691
5,0.6458,0.455644,0.867073,0.128205,0.002424,0.004757,0.57561,0.140181
6,0.626,0.511688,0.798068,0.153511,0.119729,0.134532,0.579118,0.164186
7,0.5829,0.500608,0.771318,0.135328,0.138148,0.136723,0.57172,0.16196
8,0.5286,0.498883,0.782183,0.145455,0.135725,0.140421,0.574364,0.161238
9,0.4751,0.58672,0.753908,0.145931,0.180805,0.161507,0.581269,0.1874
10,0.4291,0.641758,0.757021,0.143378,0.171595,0.156222,0.581029,0.196205


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-1000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-1000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-1000/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-1500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-1500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-1500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-2000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-2000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-2000/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-2500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-2500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-2500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-3000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-3000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-3000/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-3500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-3500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-3500/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-4000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-4000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-4000/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-4500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-4500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-4500/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-5000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-5000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-5000/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-5500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-5500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-5500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-6000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-6000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-6000/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-6500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-6500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-6500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-7000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-7000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-7000/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-7500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-7500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-7500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-8000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-8000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-8000/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-8500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-8500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-8500/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-9000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-9000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-9000/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-9500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-9500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-9500/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-10000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-10000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-10000/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-10500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-10500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-10500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-11000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-11000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-11000/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-11500


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-11500/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-11500/pytorch_model.bin


Saving model checkpoint to albert-bs32-0sh-wd-task-split-DS/checkpoint-12000


Configuration saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-12000/config.json


Model weights saved in albert-bs32-0sh-wd-task-split-DS/checkpoint-12000/pytorch_model.bin


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 15738


  Batch size = 32




Training completed. Do not forget to share your model on huggingface.co/models =)




Saving model checkpoint to /tmp/tmp5514rb4w


Configuration saved in /tmp/tmp5514rb4w/config.json


Model weights saved in /tmp/tmp5514rb4w/pytorch_model.bin


VBox(children=(Label(value='44.592 MB of 44.592 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

0,1
eval/accuracy,███▆█▄▂▃▁▁
eval/brier_score,▄▆▂▄▁▄▄▄▇█
eval/f1,▁▁▁▃▁▇▇▇██
eval/loss,▃▅▂▃▁▃▃▃▆█
eval/precision,▁▁█▂▃▃▃▃▃▃
eval/recall,▁▁▁▂▁▆▆▆██
eval/roc_auc,▅▇▁▆▇█▇▇██
eval/runtime,██▇█▃▇▄▁▆▃
eval/samples_per_second,▁▁▂▁▆▂▅█▃▆
eval/steps_per_second,▁▁▂▁▆▂▅█▃▆

0,1
eval/accuracy,0.75702
eval/brier_score,0.19621
eval/f1,0.15622
eval/loss,0.64176
eval/precision,0.14338
eval/recall,0.17159
eval/roc_auc,0.58103
eval/runtime,153.387
eval/samples_per_second,102.603
eval/steps_per_second,3.208
