In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from typing import *
from pprint import pprint

import bigbench.api.results as bb

from lass.log_handling import LogLoader, TaskLog
from lass.datasets import split_task_level, analyse, merge, huggingfaceify

In [2]:
# Start from every log under ../artifacts/logs and narrow the selection down
# one filter at a time: task set, model family, model size, number of shots
# and query type.
loader = LogLoader(logdir=Path('../artifacts/logs'))
loader = loader.with_tasks('paper-full')
loader = loader.with_model_families(['BIG-G T=0'])
loader = loader.with_model_sizes(['128b'])
loader = loader.with_shots([3])
loader = loader.with_query_types([bb.MultipleChoiceQuery])

# Reproducible split (fixed seed) that holds out a 0.2 test fraction.
# NOTE(review): presumably whole tasks are assigned to one side or the other,
# as the function name suggests -- confirm in lass.datasets.
train, test = split_task_level(
    loader,
    seed=42,
    test_fraction=0.2,
)

In [3]:
# Summarise each split on its own, then combine both summaries under the
# 'train' and 'test' labels.
train_summary = analyse(train)
test_summary = analyse(test)
stats = merge(train_summary, test_summary, 'train', 'test')

# The task-name entry is dropped before printing; with it included the
# printed summary becomes unreadable.
del stats['task_names']
pprint(stats)

# Show one training row to make the available columns visible.
train.head(1)

{'metrics': {'lm-acc': {'test': 0.3303313319991105,
                        'train': 0.3763097949886105},
             'lm-auc-absolute': {'test': 0.6642345372694248,
                                 'train': 0.612106246963259},
             'lm-auc-normalized': {'test': 0.635469089571542,
                                   'train': 0.6041244013492464}},
 'stats': {'n_instances': {'test': 8994, 'train': 46095},
           'n_instances_nonbinary': {'test': 111, 'train': 231},
           'n_tasks': {'test': 23, 'train': 95}}}


Unnamed: 0,input,targets,scores,target_values,correct,absolute_scores,normalized_scores,metrics,task,shots
0,\nIn the SIT-adversarial world a structure is ...,"[There is at most one yellow square.\n, There ...","[-5.295638084411621, -4.816563129425049, -3.96...","{'There are at least two red pieces. ': 0, 'Th...",0.0,"[-5.295638084411621, -4.816563129425049, -3.96...","[-5.226889610290527, -4.747814655303955, -3.90...",{'calibration_multiple_choice_brier_score': 0....,symbol_interpretation,3


In [4]:
# Pin this kernel to a single GPU. Both variables are exported before torch
# is imported in this cell.
# NOTE(review): presumably PCI_BUS_ID ordering is chosen so that index 1
# refers to the same card as in nvidia-smi -- confirm for this machine.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

import torch

# Sanity check: how many CUDA devices torch can see (the saved output shows 1).
print(torch.cuda.device_count())

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1
1


In [5]:
# Convert the train/test splits into the dataset object that the following
# cells index by split name ('train' / 'test') and tokenize with .map().
# NOTE(review): presumably a Hugging Face DatasetDict -- confirm in lass.datasets.
dataset = huggingfaceify(train, test)
# Display the first training example as a quick look at the fields.
dataset['train'][0]


{'text': '\nIn the SIT-adversarial world a structure is a sequence of six emojis.\nHereafter are reported the emojis used along with their descriptions.\n 🔺 is a red circle;\n 🟦 is a blue circle;\n 🔴 is a yellow circle;\n 🟥 is a red triangle pointing up;\n 🟨 is a red triangle pointing down;\n 🔻 is a red square;\n 🟡 is a blue square;\n _ is a yellow square;\n 🔵 is an empty space.\n\nChoose the sentence consistent with the structure 🔺 🔺 🟦 🔺 🟦 🟡 and not consistent with 🟨 🟨 🟦 🔴 🔵 🔵:\n\n  choice: There are at most two red pieces.\n\n  choice: There is exactly one red piece.\n\n  choice: There are exactly two yellow circles.\n\n  choice: There are at most two blue squares.\n\n  choice: There is at least one square.\n\nA: There is at least one square.\n\nChoose the sentence consistent with the structure _ 🔴 🟦 🔴 🔻 🟡 and not consistent with 🔵 🟨 🔺 🟡 _ 🔺:\n\n  choice: There are exactly two yellow circles.\n\n  choice: There is exactly one triangle.\n\n  choice: There are zero yellow squares.\n\n 

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the same "albert-base-v2" checkpoint name that the model is
# loaded from further down in the notebook.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
# GPT-2 variant (reuses the EOS token as the padding token):
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The module-level ``tokenizer`` is called with ``padding="max_length"``
    and ``truncation=True``.

    Args:
        examples: Batch passed in by ``dataset.map``; only ``examples["text"]``
            is read.

    Returns:
        Whatever the tokenizer call returns for that batch of texts.
    """
    # Extra keyword arguments that were tried for other models:
    #   return_tensors="pt"
    #   max_length=1024        (longformer)
    #   return_tensors="np"    (gpt-2)
    #   max_length=2048        (xlnet)
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize every split, handing examples to the function in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/47 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [7]:
# Shuffle both splits with the same fixed seed. For a quick smoke run on a
# small subset, append .select(range(50)) to the shuffle call.
train_dataset, eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42)
    for split_name in ("train", "test")
)

# Sizes of the two splits, displayed as the cell output.
(len(train_dataset), len(eval_dataset))

(46095, 8994)

In [8]:
import wandb
# Weights & Biases settings, exported before logging in.
%env WANDB_PROJECT=lass
# NOTE(review): presumably makes the Trainer's W&B integration upload the
# trained model as an artifact -- confirm against the W&B documentation.
%env WANDB_LOG_MODEL=true
# NOTE(review): the tokenizer has already been used in an earlier cell, and
# the saved output of this cell shows tokenizers disabling parallelism after
# a fork; consider exporting this variable before the tokenizer is first used.
%env TOKENIZERS_PARALLELISM=true
# Authenticate with W&B (the saved output shows True).
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
env: WANDB_PROJECT=lass
env: WANDB_LOG_MODEL=true
env: TOKENIZERS_PARALLELISM=true


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwschella[0m (use `wandb login --relogin` to force relogin)


True

In [9]:
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BertModel, BertConfig
import numpy as np
from datasets import load_metric
import scipy
import torch

# Sequence classifier initialised from the pretrained ALBERT checkpoint.
# The label count is spelled out (it was previously left implicit) because
# compute_metrics takes the last logit column as the positive class, i.e. it
# assumes exactly two labels; this also matches the alternatives below.
model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

# Other starting points that were tried:
# model = BertModel(BertConfig.from_pretrained("bert-base-cased"))
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=2)
# model.config.pad_token_id = model.config.eos_token_id
# Resuming from local checkpoints:
# model = AutoModelForSequenceClassification.from_pretrained("./test_trainer/checkpoint-13500", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("../artifacts/assessors/bert-bs32/checkpoint-3000", num_labels=2)

# The output directory and the W&B run share one name, and training and
# evaluation use the same per-device batch size.
run_name = "albert-base-v2-bs32-3sh-task-split"
batch_size = 32

training_args = TrainingArguments(
    output_dir=run_name,
    run_name=run_name,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch and report to Weights & Biases.
    evaluation_strategy="epoch",
    report_to="wandb",
    # max_steps=17277 - 3000,
)

# One metric object per reported score, keyed by the metric's own name.
# compute_metrics looks them up in this mapping.
metrics = {
    metric_name: load_metric(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1", "roc_auc")
}

def compute_metrics(eval_pred):
    """Compute the evaluation scores for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as a 2-D
            array with one column per class; ``labels`` holds the reference
            class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"`` and ``"roc_auc"``, each taken from the matching entry of the
        module-level ``metrics`` mapping.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.special` is loaded as an attribute.
    from scipy.special import softmax

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # ROC AUC metric requires probabilities instead of logits, and only of the
    # "positive" class (= "highest label" = 1).
    # Needs to change for multi-class or multi-label.
    prediction_scores = softmax(logits, axis=-1)[:, -1]

    # These four metrics all score the hard (argmax) predictions.
    results = {
        name: metrics[name].compute(predictions=predictions, references=labels)[name]
        for name in ("accuracy", "precision", "recall", "f1")
    }
    # ROC AUC is the only metric that scores the positive-class probability.
    results["roc_auc"] = metrics["roc_auc"].compute(
        prediction_scores=prediction_scores, references=labels
    )["roc_auc"]
    return results

# Wire the model, the training arguments, both dataset splits and the metric
# callback into a single Trainer instance used by the training cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # the shuffled "test" split
    compute_metrics=compute_metrics,  # scores reported at each evaluation
)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [10]:
# Release cached GPU memory left over from earlier cells before training.
torch.cuda.empty_cache()

# Always close the W&B run, also when training fails or the cell is
# interrupted; otherwise the run stays open and a re-run of this cell keeps
# logging into it.
try:
    trainer.train()
except BaseException:
    # A non-zero exit code marks the run as failed instead of finished.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

The following columns in the training set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 46095
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4323
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.6253,0.602992,0.703024,0.560778,0.465836,0.508917,0.665452
2,0.601,0.619904,0.608628,0.435427,0.623023,0.5126,0.675861
3,0.5845,0.580572,0.670336,0.501201,0.421407,0.457853,0.696037


Saving model checkpoint to albert-base-v2-bs32-3sh-task-split/checkpoint-500
Configuration saved in albert-base-v2-bs32-3sh-task-split/checkpoint-500/config.json
Model weights saved in albert-base-v2-bs32-3sh-task-split/checkpoint-500/pytorch_model.bin
Saving model checkpoint to albert-base-v2-bs32-3sh-task-split/checkpoint-1000
Configuration saved in albert-base-v2-bs32-3sh-task-split/checkpoint-1000/config.json
Model weights saved in albert-base-v2-bs32-3sh-task-split/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8994
  Batch size = 32
Saving model checkpoint to albert-base-v2-bs32-3sh-task-split/checkpoint-1500
Configuration saved in albert-base-v2-bs32-3sh-task-split/checkpoint-1500/confi

VBox(children=(Label(value='44.592 MB of 44.592 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

0,1
eval/accuracy,█▁▆
eval/f1,██▁
eval/loss,▅█▁
eval/precision,█▁▅
eval/recall,▃█▁
eval/roc_auc,▁▃█
eval/runtime,▁▇█
eval/samples_per_second,█▂▁
eval/steps_per_second,█▂▁
train/epoch,▁▂▃▃▄▅▅▆▆▇██

0,1
eval/accuracy,0.67034
eval/f1,0.45785
eval/loss,0.58057
eval/precision,0.5012
eval/recall,0.42141
eval/roc_auc,0.69604
eval/runtime,72.3687
eval/samples_per_second,124.28
eval/steps_per_second,3.897
train/epoch,3.0


In [11]:
import wandb
# wandb.finish()