In [1]:
from pathlib import Path
from typing import *

import bigbench.api.results as bb

from lass.log_handling import LogLoader, TaskLog

In [2]:
loader = (LogLoader(logdir = Path('../artifacts/logs'))
        .with_tasks('paper-full')
        .with_model_families(['BIG-G T=0'])
        .with_model_sizes(['128b'])
        .with_shots([3])
        .with_query_types([bb.MultipleChoiceQuery])
)

tasks: List[bb.ResultsFileData] = list(loader.load_per_model())
print(f"{len(tasks)} tasks loaded")

176 tasks loaded


In [3]:
n_queries = sum(len(log.queries or []) for log in tasks)
print(f"{n_queries} queries")

337 queries


In [4]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

import torch

print(torch.cuda.device_count())

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1
1


In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd

train_tasks, test_tasks = train_test_split(tasks, test_size=0.2, random_state=42)

def to_dataframe(tasks: List[bb.ResultsFileData]) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for task in tasks:
        for query in (task.queries or []):
            dfs.append(pd.DataFrame(query.samples))
    return pd.concat(dfs)

df_train = to_dataframe(train_tasks)
df_test = to_dataframe(test_tasks)

print("Original data:")
display(df_train.head(1))
display(df_train['correct'].value_counts().head(4))


Original data:


Unnamed: 0,input,targets,scores,target_values,correct,absolute_scores,normalized_scores,metrics
0,\nIn the SIT-adversarial world a structure is ...,"[There is at most one yellow square.\n, There ...","[-5.295638084411621, -4.816563129425049, -3.96...","{'There are at least two red pieces. ': 0, 'Th...",0.0,"[-5.295638084411621, -4.816563129425049, -3.96...","[-5.226889610290527, -4.747814655303955, -3.90...",{'calibration_multiple_choice_brier_score': 0....


0.0    28749
1.0    17346
0.8       72
0.6       64
Name: correct, dtype: int64

In [7]:
def huggingfaceify(df: pd.DataFrame) -> pd.DataFrame:
    df_hf = df[['input', 'correct']].rename(columns={'input':'text','correct':'label'})
    df_hf = df_hf[df_hf['label'].isin([0.0, 1.0])]
    df_hf[['label']] = df_hf[['label']].astype(int)
    return df_hf


print("\n\nHuggingface ready data:")
dfhf_train = huggingfaceify(df_train)
dfhf_test = huggingfaceify(df_test)
display(dfhf_train.head(1))
display(dfhf_train['label'].value_counts().head(5))




Huggingface ready data:


Unnamed: 0,text,label
0,\nIn the SIT-adversarial world a structure is ...,0


0    28749
1    17346
Name: label, dtype: int64

In [8]:
from datasets import Dataset, DatasetDict


display(f"Train accuracy: {dfhf_train['label'].mean():.2f} ({len(dfhf_train)} instances)")
display(f"Test accuracy: {dfhf_test['label'].mean():.2f} ({len(dfhf_test)} instances)")

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(dfhf_train, split='train')
ds['test'] = Dataset.from_pandas(dfhf_test, split='test')

dataset = ds
ds['train'][0]

'Train accuracy: 0.38 (46095 instances)'

'Test accuracy: 0.33 (8994 instances)'

{'text': '\nIn the SIT-adversarial world a structure is a sequence of six emojis.\nHereafter are reported the emojis used along with their descriptions.\n 游댴 is a red circle;\n 游릱 is a blue circle;\n 游댮 is a yellow circle;\n 游린 is a red triangle pointing up;\n 游릳 is a red triangle pointing down;\n 游댵 is a red square;\n 游리 is a blue square;\n _ is a yellow square;\n 游댯 is an empty space.\n\nChoose the sentence consistent with the structure 游댴 游댴 游릱 游댴 游릱 游리 and not consistent with 游릳 游릳 游릱 游댮 游댯 游댯:\n\n  choice: There are at most two red pieces.\n\n  choice: There is exactly one red piece.\n\n  choice: There are exactly two yellow circles.\n\n  choice: There are at most two blue squares.\n\n  choice: There is at least one square.\n\nA: There is at least one square.\n\nChoose the sentence consistent with the structure _ 游댮 游릱 游댮 游댵 游리 and not consistent with 游댯 游릳 游댴 游리 _ 游댴:\n\n  choice: There are exactly two yellow circles.\n\n  choice: There is exactly one triangle.\n\n  choice: There

In [9]:
# Analyse LM results
import math
import sklearn as sk
import numpy as np

lm_preds = df_test[df_test['correct'].isin([0.0, 1.0])].copy()

def confidence(row):
    return np.max(row['normalized_scores'])

# Assumes the actual output is arg_max
lm_preds['confidence_normalized'] = lm_preds.apply(lambda row: math.exp(np.max(row['normalized_scores'])), axis=1)
lm_preds['confidence_absolute'] = lm_preds.apply(lambda row: math.exp(np.max(row['absolute_scores'])), axis=1)

display(sk.metrics.roc_auc_score(lm_preds['correct'], lm_preds['confidence_normalized']))
display(sk.metrics.roc_auc_score(lm_preds['correct'], lm_preds['confidence_absolute']))

0.635469089571542

0.6642345372694248

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    # return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors="pt")
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/47 [00:00<?, ?ba/s]

TypeError: Provided `function` which is applied to all elements of table returns a `dict` of types [<class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>]. When using `batched=True`, make sure provided `function` returns a `dict` of types like `(<class 'list'>, <class 'numpy.ndarray'>)`.

In [10]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(50))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42) #.select(range(50))
len(train_dataset), len(eval_dataset)

(46072, 8986)

In [11]:
import wandb
%env WANDB_PROJECT=lass
%env WANDB_LOG_MODEL=true
%env TOKENIZERS_PARALLELISM=true
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


env: WANDB_PROJECT=lass
env: WANDB_LOG_MODEL=true
env: TOKENIZERS_PARALLELISM=true


[34m[1mwandb[0m: Currently logged in as: [33mwschella[0m (use `wandb login --relogin` to force relogin)


True

In [12]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
import scipy
import torch.nn as nn

# model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("./test_trainer/checkpoint-13500", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("../artifacts/assessors/bert-bs32/checkpoint-3000", num_labels=2)

training_args = TrainingArguments(
    output_dir="xlnet-bs32-task-split",
    run_name="xlnet-bs32-task-split",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    report_to="wandb",
    num_train_epochs=3
)

metrics = {
    "accuracy": load_metric("accuracy"),
    "precision": load_metric("precision"),
    "recall": load_metric("recall"),
    "f1": load_metric("f1"),
    "roc_auc": load_metric("roc_auc"),
}

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)    
    # ROC AUC metric requires probabilities instead of logits, and only of the "postive" class (="highest label" = 1).
    # Needs to change for multi-class or multi-label.
    prediction_scores = scipy.special.softmax(logits,axis=-1)[:,-1]
    return {
          "accuracy": metrics["accuracy"].compute(predictions=predictions, references=labels)["accuracy"],
          "precision": metrics["precision"].compute(predictions=predictions, references=labels)["precision"],
          "recall": metrics["recall"].compute(predictions=predictions, references=labels)["recall"],
          "f1": metrics["f1"].compute(predictions=predictions, references=labels)["f1"],
          "roc_auc": metrics["roc_auc"].compute(prediction_scores=prediction_scores, references=labels)["roc_auc"],
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [13]:
trainer.train()

wandb.finish()

The following columns in the training set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 46072
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4320
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


ValueError: expected sequence of length 94 at dim 1 (got 215)

In [None]:
import wandb
# wandb.finish()