In [1]:
import json
from pprint import pprint, pformat
from pathlib import Path
from dataclasses import dataclass

import bigbench.api.results as bb

from lmasss.log_handling import LogLoader

# Creating test set

Things to keep in mind:
- Instances are used for multiple systems. These multiple results should not be split across train and test.
- Instances are used for multiple n-shots. These multiple results should not be split across train and test.
- Not all models have results available for all tasks, specifically, BIG-G sparse results are not available for multiple tasks.

In [2]:
loader = (LogLoader(logdir = Path('../artifacts/logs'))
        .with_output_unit('sample')
        .with_tasks('paper-full')
        .with_model_families(['BIG-G T=0'])
        .with_model_sizes(['128b'])
        .with_shots([0])
        .with_query_types([bb.MultipleChoiceQuery])
)

samples = list(loader.load())
print(f"{len(samples)} samples\nSample #0:\n{pformat(vars(samples[0]))} ")

55431 samples
Sample #0:
{'absolute_scores': [-17.807415008544922,
                     -18.298959732055664,
                     -12.581655502319336,
                     -17.21178436279297,
                     -31.068716049194336],
 'correct': 0.0,
 'input': 'In what follows, we provide short narratives, each of which '
          'illustrates a common proverb. \n'
          'Narrative: Carla was having trouble juggling college, working at '
          'the diner and being a mother. She never had a day off and was burnt '
          'out. Her friend told her that her hard work would pay off one day '
          "and to keep going! Carla's friend was right; after a few more years "
          'of hard work, Carla graduated school and was able to get a better '
          'job. She was able to take a vacation and become overall '
          'successful.\n'
          'This narrative is a good illustration of the following proverb: ',
 'metrics': {'calibration_multiple_choice_brier_score': 0.3

In [3]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

import torch

print(torch.cuda.device_count())

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1
1


In [4]:
import pandas as pd

print("Original data:")
df = pd.DataFrame(samples)
display(df.head(1))
display(df['correct'].value_counts().head(4))

print("\n\nHuggingface ready data:")
# df_hf stands for HuggingFace dataset compatible DataFrame
df_hf = df[['input', 'correct']].rename(columns={'input':'text','correct':'label'})
df_hf = df_hf[df_hf['label'].isin([0.0, 1.0])]
df_hf[['label']] = df_hf[['label']].astype(int)
display(df_hf.head(1))
display(df_hf['label'].value_counts().head(5))



Original data:


Unnamed: 0,input,targets,scores,target_values,correct,absolute_scores,normalized_scores,metrics
0,"In what follows, we provide short narratives, ...","[Seek and you shall find, It's better to light...","[-17.807415008544922, -18.298959732055664, -12...","{'April showers bring forth May flowers': 1, '...",0.0,"[-17.807415008544922, -18.298959732055664, -12...","[-5.244009971618652, -5.7355546951293945, -0.0...",{'calibration_multiple_choice_brier_score': 0....


0.0    36548
1.0    18510
0.8       85
0.6       63
Name: correct, dtype: int64



Huggingface ready data:


Unnamed: 0,text,label
0,"In what follows, we provide short narratives, ...",0


0    36548
1    18510
Name: label, dtype: int64

In [5]:
from datasets import Dataset, DatasetDict

test_fraction = 0.2

# Per instance
df_hf_train=df_hf.sample(frac=(1-test_fraction),random_state=1234)
df_hf_test=df_hf.drop(df_hf_train.index)

# Per task (approx)
# df_hf_train = df.iloc[:900,:]
# df_hf_test = df.iloc[900:,:]

display(f"Train accuracy: {df_hf_train['label'].mean():.2f} ({len(df_hf_train)} instances)")
display(f"Test accuracy: {df_hf_test['label'].mean():.2f} ({len(df_hf_test)} instances)")

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(df_hf_train, split='train')
ds['test'] = Dataset.from_pandas(df_hf_test, split='test')

dataset = ds
ds['train'][0]

'Train accuracy: 0.34 (44046 instances)'

'Test accuracy: 0.34 (11012 instances)'

{'text': 'If you follow these instructions, do you return to the starting point?\nQ: Turn left. Turn left. Take 5 steps. Turn around. Take 5 steps.\nA: ',
 'label': 1,
 '__index_level_0__': 34229}

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [7]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(50))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42) #.select(range(50))
len(train_dataset), len(eval_dataset)

(44046, 11012)

In [8]:
import wandb
%env WANDB_PROJECT=langasss
%env WANDB_LOG_MODEL=true
%env TOKENIZERS_PARALLELISM=true
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
env: WANDB_PROJECT=langasss
env: WANDB_LOG_MODEL=true
env: TOKENIZERS_PARALLELISM=true


[34m[1mwandb[0m: Currently logged in as: [33mwschella[0m (use `wandb login --relogin` to force relogin)


True

In [9]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
import scipy
import torch.nn as nn

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("./test_trainer/checkpoint-13500", num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    report_to="wandb",
    run_name="bert-bs32",
    num_train_epochs=10
)

metrics = {
    "accuracy": load_metric("accuracy"),
    "precision": load_metric("precision"),
    "recall": load_metric("recall"),
    "f1": load_metric("f1"),
    "roc_auc": load_metric("roc_auc"),
}

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)    
    # ROC AUC metric requires probabilities instead of logits, and only of the "postive" class (="highest label" = 1).
    # Needs to change for multi-class or multi-label.
    prediction_scores = scipy.special.softmax(logits,axis=-1)[:,-1]
    return {
          "accuracy": metrics["accuracy"].compute(predictions=predictions, references=labels)["accuracy"],
          "precision": metrics["precision"].compute(predictions=predictions, references=labels)["precision"],
          "recall": metrics["recall"].compute(predictions=predictions, references=labels)["recall"],
          "f1": metrics["f1"].compute(predictions=predictions, references=labels)["f1"],
          "roc_auc": metrics["roc_auc"].compute(prediction_scores=prediction_scores, references=labels)["roc_auc"],
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

wandb.finish()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.5862,0.587997,0.688249,0.581036,0.282801,0.380437,0.720573
2,0.5414,0.544373,0.709226,0.595005,0.441105,0.506626,0.753256
3,0.4947,0.569472,0.697239,0.55469,0.534746,0.544536,0.745098
4,0.4061,0.649657,0.705776,0.573587,0.509257,0.539511,0.739821
5,0.3238,0.747163,0.693607,0.552203,0.500939,0.525324,0.719459
6,0.2656,0.921601,0.693607,0.554125,0.48484,0.517172,0.709578
7,0.2235,1.101204,0.694788,0.557259,0.477864,0.514517,0.693583
8,0.1839,1.288037,0.69506,0.552729,0.518916,0.535289,0.687968
9,0.1563,1.402703,0.695877,0.555654,0.506305,0.529833,0.685668
10,0.13,1.633535,0.690792,0.547049,0.502281,0.52371,0.677489


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11012
  Batch size = 32
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configur

VBox(children=(Label(value='413.249 MB of 413.249 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
eval/accuracy,▁█▄▇▃▃▃▃▄▂
eval/f1,▁▆██▇▇▇█▇▇
eval/loss,▁▁▁▂▂▃▅▆▇█
eval/precision,▆█▂▅▂▂▂▂▂▁
eval/recall,▁▅█▇▇▇▆█▇▇
eval/roc_auc,▅█▇▇▅▄▂▂▂▁
eval/runtime,▂▁▄█▅▄▄█▂▃
eval/samples_per_second,▇█▅▁▄▅▅▁▇▆
eval/steps_per_second,▇█▅▁▄▅▅▁▇▆
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.69079
eval/f1,0.52371
eval/loss,1.63354
eval/precision,0.54705
eval/recall,0.50228
eval/roc_auc,0.67749
eval/runtime,71.1603
eval/samples_per_second,154.749
eval/steps_per_second,4.848
train/epoch,10.0


In [10]:
import wandb
# wandb.finish()